dstack-0.19.4rc3-py3-none-any.whl → dstack-0.19.6rc1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dstack might be problematic.

Files changed (183)
  1. dstack/_internal/cli/commands/attach.py +22 -20
  2. dstack/_internal/cli/commands/offer.py +116 -0
  3. dstack/_internal/cli/main.py +2 -0
  4. dstack/_internal/cli/services/configurators/base.py +1 -2
  5. dstack/_internal/cli/services/configurators/fleet.py +43 -20
  6. dstack/_internal/cli/services/configurators/run.py +3 -3
  7. dstack/_internal/cli/utils/run.py +43 -38
  8. dstack/_internal/core/backends/aws/auth.py +1 -2
  9. dstack/_internal/core/backends/aws/compute.py +24 -9
  10. dstack/_internal/core/backends/aws/configurator.py +2 -3
  11. dstack/_internal/core/backends/aws/resources.py +10 -0
  12. dstack/_internal/core/backends/azure/auth.py +1 -2
  13. dstack/_internal/core/backends/azure/compute.py +15 -5
  14. dstack/_internal/core/backends/azure/configurator.py +4 -5
  15. dstack/_internal/core/backends/azure/resources.py +14 -0
  16. dstack/_internal/core/backends/base/compute.py +99 -31
  17. dstack/_internal/core/backends/gcp/auth.py +1 -2
  18. dstack/_internal/core/backends/gcp/compute.py +58 -14
  19. dstack/_internal/core/backends/gcp/configurator.py +2 -3
  20. dstack/_internal/core/backends/gcp/features/tcpx.py +31 -0
  21. dstack/_internal/core/backends/gcp/resources.py +10 -0
  22. dstack/_internal/core/backends/nebius/compute.py +6 -2
  23. dstack/_internal/core/backends/nebius/configurator.py +4 -10
  24. dstack/_internal/core/backends/nebius/models.py +14 -1
  25. dstack/_internal/core/backends/nebius/resources.py +91 -10
  26. dstack/_internal/core/backends/oci/auth.py +1 -2
  27. dstack/_internal/core/backends/oci/configurator.py +1 -2
  28. dstack/_internal/core/backends/runpod/compute.py +1 -1
  29. dstack/_internal/core/errors.py +4 -0
  30. dstack/_internal/core/models/common.py +2 -14
  31. dstack/_internal/core/models/configurations.py +24 -2
  32. dstack/_internal/core/models/envs.py +2 -2
  33. dstack/_internal/core/models/fleets.py +34 -3
  34. dstack/_internal/core/models/gateways.py +18 -4
  35. dstack/_internal/core/models/instances.py +2 -1
  36. dstack/_internal/core/models/profiles.py +12 -0
  37. dstack/_internal/core/models/runs.py +6 -0
  38. dstack/_internal/core/models/secrets.py +1 -1
  39. dstack/_internal/core/models/volumes.py +17 -1
  40. dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +3 -3
  41. dstack/_internal/proxy/gateway/services/nginx.py +0 -1
  42. dstack/_internal/proxy/gateway/services/registry.py +0 -1
  43. dstack/_internal/server/background/tasks/process_instances.py +12 -9
  44. dstack/_internal/server/background/tasks/process_running_jobs.py +66 -15
  45. dstack/_internal/server/routers/fleets.py +22 -0
  46. dstack/_internal/server/routers/runs.py +1 -0
  47. dstack/_internal/server/schemas/fleets.py +12 -2
  48. dstack/_internal/server/schemas/runner.py +6 -0
  49. dstack/_internal/server/schemas/runs.py +3 -0
  50. dstack/_internal/server/services/docker.py +1 -2
  51. dstack/_internal/server/services/fleets.py +30 -12
  52. dstack/_internal/server/services/gateways/__init__.py +1 -0
  53. dstack/_internal/server/services/instances.py +3 -1
  54. dstack/_internal/server/services/jobs/__init__.py +1 -2
  55. dstack/_internal/server/services/jobs/configurators/base.py +17 -8
  56. dstack/_internal/server/services/locking.py +16 -1
  57. dstack/_internal/server/services/projects.py +1 -2
  58. dstack/_internal/server/services/proxy/repo.py +1 -2
  59. dstack/_internal/server/services/runner/client.py +3 -0
  60. dstack/_internal/server/services/runs.py +19 -16
  61. dstack/_internal/server/services/services/__init__.py +1 -2
  62. dstack/_internal/server/services/volumes.py +29 -2
  63. dstack/_internal/server/statics/00a6e1fb461ed2929fb9.png +0 -0
  64. dstack/_internal/server/statics/0cae4d9f0a36034984a7.png +0 -0
  65. dstack/_internal/server/statics/391de232cc0e30cae513.png +0 -0
  66. dstack/_internal/server/statics/4e0eead8c1a73689ef9d.svg +1 -0
  67. dstack/_internal/server/statics/544afa2f63428c2235b0.png +0 -0
  68. dstack/_internal/server/statics/54a4f50f74c6b9381530.svg +7 -0
  69. dstack/_internal/server/statics/68dd1360a7d2611e0132.svg +4 -0
  70. dstack/_internal/server/statics/69544b4c81973b54a66f.png +0 -0
  71. dstack/_internal/server/statics/77a8b02b17af19e39266.png +0 -0
  72. dstack/_internal/server/statics/83a93a8871c219104367.svg +9 -0
  73. dstack/_internal/server/statics/8f28bb8e9999e5e6a48b.svg +4 -0
  74. dstack/_internal/server/statics/9124086961ab8c366bc4.svg +9 -0
  75. dstack/_internal/server/statics/9a9ebaeb54b025dbac0a.svg +5 -0
  76. dstack/_internal/server/statics/a3428392dc534f3b15c4.svg +7 -0
  77. dstack/_internal/server/statics/ae22625574d69361f72c.png +0 -0
  78. dstack/_internal/server/statics/assets/android-chrome-144x144.png +0 -0
  79. dstack/_internal/server/statics/assets/android-chrome-192x192.png +0 -0
  80. dstack/_internal/server/statics/assets/android-chrome-256x256.png +0 -0
  81. dstack/_internal/server/statics/assets/android-chrome-36x36.png +0 -0
  82. dstack/_internal/server/statics/assets/android-chrome-384x384.png +0 -0
  83. dstack/_internal/server/statics/assets/android-chrome-48x48.png +0 -0
  84. dstack/_internal/server/statics/assets/android-chrome-512x512.png +0 -0
  85. dstack/_internal/server/statics/assets/android-chrome-72x72.png +0 -0
  86. dstack/_internal/server/statics/assets/android-chrome-96x96.png +0 -0
  87. dstack/_internal/server/statics/assets/apple-touch-icon-1024x1024.png +0 -0
  88. dstack/_internal/server/statics/assets/apple-touch-icon-114x114.png +0 -0
  89. dstack/_internal/server/statics/assets/apple-touch-icon-120x120.png +0 -0
  90. dstack/_internal/server/statics/assets/apple-touch-icon-144x144.png +0 -0
  91. dstack/_internal/server/statics/assets/apple-touch-icon-152x152.png +0 -0
  92. dstack/_internal/server/statics/assets/apple-touch-icon-167x167.png +0 -0
  93. dstack/_internal/server/statics/assets/apple-touch-icon-180x180.png +0 -0
  94. dstack/_internal/server/statics/assets/apple-touch-icon-57x57.png +0 -0
  95. dstack/_internal/server/statics/assets/apple-touch-icon-60x60.png +0 -0
  96. dstack/_internal/server/statics/assets/apple-touch-icon-72x72.png +0 -0
  97. dstack/_internal/server/statics/assets/apple-touch-icon-76x76.png +0 -0
  98. dstack/_internal/server/statics/assets/apple-touch-icon-precomposed.png +0 -0
  99. dstack/_internal/server/statics/assets/apple-touch-icon.png +0 -0
  100. dstack/_internal/server/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
  101. dstack/_internal/server/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
  102. dstack/_internal/server/statics/assets/apple-touch-startup-image-1170x2532.png +0 -0
  103. dstack/_internal/server/statics/assets/apple-touch-startup-image-1179x2556.png +0 -0
  104. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
  105. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
  106. dstack/_internal/server/statics/assets/apple-touch-startup-image-1284x2778.png +0 -0
  107. dstack/_internal/server/statics/assets/apple-touch-startup-image-1290x2796.png +0 -0
  108. dstack/_internal/server/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
  109. dstack/_internal/server/statics/assets/apple-touch-startup-image-1488x2266.png +0 -0
  110. dstack/_internal/server/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
  111. dstack/_internal/server/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
  112. dstack/_internal/server/statics/assets/apple-touch-startup-image-1640x2160.png +0 -0
  113. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
  114. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
  115. dstack/_internal/server/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
  116. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
  117. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
  118. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
  119. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1640.png +0 -0
  120. dstack/_internal/server/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
  121. dstack/_internal/server/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
  122. dstack/_internal/server/statics/assets/apple-touch-startup-image-2266x1488.png +0 -0
  123. dstack/_internal/server/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
  124. dstack/_internal/server/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
  125. dstack/_internal/server/statics/assets/apple-touch-startup-image-2532x1170.png +0 -0
  126. dstack/_internal/server/statics/assets/apple-touch-startup-image-2556x1179.png +0 -0
  127. dstack/_internal/server/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
  128. dstack/_internal/server/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
  129. dstack/_internal/server/statics/assets/apple-touch-startup-image-2778x1284.png +0 -0
  130. dstack/_internal/server/statics/assets/apple-touch-startup-image-2796x1290.png +0 -0
  131. dstack/_internal/server/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
  132. dstack/_internal/server/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
  133. dstack/_internal/server/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
  134. dstack/_internal/server/statics/assets/browserconfig.xml +12 -0
  135. dstack/_internal/server/statics/assets/favicon-16x16.png +0 -0
  136. dstack/_internal/server/statics/assets/favicon-32x32.png +0 -0
  137. dstack/_internal/server/statics/assets/favicon-48x48.png +0 -0
  138. dstack/_internal/server/statics/assets/favicon.ico +0 -0
  139. dstack/_internal/server/statics/assets/manifest.webmanifest +67 -0
  140. dstack/_internal/server/statics/assets/mstile-144x144.png +0 -0
  141. dstack/_internal/server/statics/assets/mstile-150x150.png +0 -0
  142. dstack/_internal/server/statics/assets/mstile-310x150.png +0 -0
  143. dstack/_internal/server/statics/assets/mstile-310x310.png +0 -0
  144. dstack/_internal/server/statics/assets/mstile-70x70.png +0 -0
  145. dstack/_internal/server/statics/assets/yandex-browser-50x50.png +0 -0
  146. dstack/_internal/server/statics/assets/yandex-browser-manifest.json +9 -0
  147. dstack/_internal/server/statics/b7ae68f44193474fc578.png +0 -0
  148. dstack/_internal/server/statics/d2f008c75b2b5b191f3f.png +0 -0
  149. dstack/_internal/server/statics/d44c33e1b92e05c379fd.png +0 -0
  150. dstack/_internal/server/statics/dd43ff0552815179d7ab.png +0 -0
  151. dstack/_internal/server/statics/dd4e7166c0b9aac197d7.png +0 -0
  152. dstack/_internal/server/statics/e30b27916930d43d2271.png +0 -0
  153. dstack/_internal/server/statics/e467d7d60aae81ab198b.svg +6 -0
  154. dstack/_internal/server/statics/eb9b344b73818fe2b71a.png +0 -0
  155. dstack/_internal/server/statics/f517dd626eb964120de0.png +0 -0
  156. dstack/_internal/server/statics/f958aecddee5d8e3222c.png +0 -0
  157. dstack/_internal/server/statics/index.html +3 -0
  158. dstack/_internal/server/statics/main-8f9c66f404e9c7e7e020.css +3 -0
  159. dstack/_internal/server/statics/main-b4f65323f5df007e1664.js +136480 -0
  160. dstack/_internal/server/statics/main-b4f65323f5df007e1664.js.map +1 -0
  161. dstack/_internal/server/statics/manifest.json +16 -0
  162. dstack/_internal/server/statics/robots.txt +3 -0
  163. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  164. dstack/_internal/server/statics/static/media/github.1f7102513534c83a9d8d735d2b8c12a2.svg +3 -0
  165. dstack/_internal/server/statics/static/media/logo.f602feeb138844eda97c8cb641461448.svg +124 -0
  166. dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg +2 -0
  167. dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg +3 -0
  168. dstack/_internal/server/testing/common.py +10 -0
  169. dstack/_internal/utils/tags.py +42 -0
  170. dstack/api/server/__init__.py +3 -1
  171. dstack/api/server/_fleets.py +52 -9
  172. dstack/api/server/_gateways.py +17 -2
  173. dstack/api/server/_runs.py +34 -11
  174. dstack/api/server/_volumes.py +2 -3
  175. dstack/version.py +1 -1
  176. {dstack-0.19.4rc3.dist-info → dstack-0.19.6rc1.dist-info}/METADATA +2 -2
  177. {dstack-0.19.4rc3.dist-info → dstack-0.19.6rc1.dist-info}/RECORD +180 -76
  178. dstack-0.19.4rc3.data/data/dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +0 -1
  179. dstack-0.19.4rc3.data/data/dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 +0 -27
  180. dstack-0.19.4rc3.data/data/dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +0 -88
  181. {dstack-0.19.4rc3.dist-info → dstack-0.19.6rc1.dist-info}/WHEEL +0 -0
  182. {dstack-0.19.4rc3.dist-info → dstack-0.19.6rc1.dist-info}/entry_points.txt +0 -0
  183. {dstack-0.19.4rc3.dist-info → dstack-0.19.6rc1.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -10,7 +10,7 @@ from sqlalchemy.orm import joinedload
 from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HTTP_PORT
 from dstack._internal.core.errors import GatewayError
 from dstack._internal.core.models.backends.base import BackendType
-from dstack._internal.core.models.common import NetworkMode, RegistryAuth, is_core_model_instance
+from dstack._internal.core.models.common import NetworkMode, RegistryAuth
 from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
 from dstack._internal.core.models.instances import (
     InstanceStatus,
@@ -40,7 +40,7 @@ from dstack._internal.server.models import (
     RepoModel,
     RunModel,
 )
-from dstack._internal.server.schemas.runner import TaskStatus
+from dstack._internal.server.schemas.runner import GPUDevice, TaskStatus
 from dstack._internal.server.services import logs as logs_services
 from dstack._internal.server.services import services
 from dstack._internal.server.services.instances import get_instance_ssh_private_keys
@@ -422,9 +422,9 @@ def _process_provisioning_with_shim(
     volume_mounts: List[VolumeMountPoint] = []
     instance_mounts: List[InstanceMountPoint] = []
     for mount in run.run_spec.configuration.volumes:
-        if is_core_model_instance(mount, VolumeMountPoint):
+        if isinstance(mount, VolumeMountPoint):
             volume_mounts.append(mount.copy())
-        elif is_core_model_instance(mount, InstanceMountPoint):
+        elif isinstance(mount, InstanceMountPoint):
            instance_mounts.append(mount)
         else:
             assert False, f"unexpected mount point: {mount!r}"
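This release consistently replaces the internal `is_core_model_instance` helper with plain `isinstance` (the helper itself is dropped from dstack/_internal/core/models/common.py, +2 -14 in the file list above). For context, here is a minimal sketch of the same dispatch pattern as in the hunk above — an illustration, not dstack code:

# Sketch only: mirrors the mount-point dispatch shown in the hunk above.
# VolumeMountPoint and InstanceMountPoint are ordinary pydantic models,
# so plain isinstance() is enough to tell the union members apart.
from typing import List, Tuple, Union

from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint


def split_mounts(
    mounts: List[Union[VolumeMountPoint, InstanceMountPoint]],
) -> Tuple[List[VolumeMountPoint], List[InstanceMountPoint]]:
    volume_mounts: List[VolumeMountPoint] = []
    instance_mounts: List[InstanceMountPoint] = []
    for mount in mounts:
        if isinstance(mount, VolumeMountPoint):
            volume_mounts.append(mount.copy())
        elif isinstance(mount, InstanceMountPoint):
            instance_mounts.append(mount)
        else:
            raise AssertionError(f"unexpected mount point: {mount!r}")
    return volume_mounts, instance_mounts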
@@ -438,6 +438,10 @@ def _process_provisioning_with_shim(
         job_provisioning_data.backend, job_provisioning_data.instance_type.name
     )

+    gpu_devices = _get_instance_specific_gpu_devices(
+        job_provisioning_data.backend, job_provisioning_data.instance_type.name
+    )
+
     container_user = "root"

     job_runtime_data = get_job_runtime_data(job_model)
@@ -471,6 +475,7 @@ def _process_provisioning_with_shim(
         volumes=volumes,
         volume_mounts=volume_mounts,
         instance_mounts=instance_mounts,
+        gpu_devices=gpu_devices,
         host_ssh_user=ssh_user,
         host_ssh_keys=[ssh_key] if ssh_key else [],
         container_ssh_keys=public_keys,
@@ -657,7 +662,7 @@ def _terminate_if_inactivity_duration_exceeded(
     run_model: RunModel, job_model: JobModel, no_connections_secs: Optional[int]
 ) -> None:
     conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration
-    if not is_core_model_instance(conf, DevEnvironmentConfiguration) or not isinstance(
+    if not isinstance(conf, DevEnvironmentConfiguration) or not isinstance(
         conf.inactivity_duration, int
     ):
         # reset in case inactivity_duration was disabled via in-place update
@@ -834,14 +839,60 @@ def _submit_job_to_runner(
 def _get_instance_specific_mounts(
     backend_type: BackendType, instance_type_name: str
 ) -> List[InstanceMountPoint]:
-    if backend_type == BackendType.GCP and instance_type_name == "a3-megagpu-8g":
-        return [
-            InstanceMountPoint(
-                instance_path="/dev/aperture_devices", path="/dev/aperture_devices"
-            ),
-            InstanceMountPoint(instance_path="/var/lib/tcpxo/lib64", path="/var/lib/tcpxo/lib64"),
-            InstanceMountPoint(
-                instance_path="/var/lib/fastrak/lib64", path="/var/lib/fastrak/lib64"
-            ),
-        ]
+    if backend_type == BackendType.GCP:
+        if instance_type_name == "a3-megagpu-8g":
+            return [
+                InstanceMountPoint(
+                    instance_path="/dev/aperture_devices",
+                    path="/dev/aperture_devices",
+                ),
+                InstanceMountPoint(
+                    instance_path="/var/lib/tcpxo/lib64",
+                    path="/var/lib/tcpxo/lib64",
+                ),
+                InstanceMountPoint(
+                    instance_path="/var/lib/fastrak/lib64",
+                    path="/var/lib/fastrak/lib64",
+                ),
+            ]
+        if instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
+            return [
+                InstanceMountPoint(
+                    instance_path="/var/lib/nvidia/lib64",
+                    path="/usr/local/nvidia/lib64",
+                ),
+                InstanceMountPoint(
+                    instance_path="/var/lib/nvidia/bin",
+                    path="/usr/local/nvidia/bin",
+                ),
+                InstanceMountPoint(
+                    instance_path="/var/lib/tcpx/lib64",
+                    path="/usr/local/tcpx/lib64",
+                ),
+                InstanceMountPoint(
+                    instance_path="/run/tcpx",
+                    path="/run/tcpx",
+                ),
+            ]
     return []
+
+
+def _get_instance_specific_gpu_devices(
+    backend_type: BackendType, instance_type_name: str
+) -> List[GPUDevice]:
+    gpu_devices = []
+    if backend_type == BackendType.GCP and instance_type_name in [
+        "a3-edgegpu-8g",
+        "a3-highgpu-8g",
+    ]:
+        for i in range(8):
+            gpu_devices.append(
+                GPUDevice(path_on_host=f"/dev/nvidia{i}", path_in_container=f"/dev/nvidia{i}")
+            )
+        gpu_devices.append(
+            GPUDevice(path_on_host="/dev/nvidia-uvm", path_in_container="/dev/nvidia-uvm")
+        )
+        gpu_devices.append(
+            GPUDevice(path_on_host="/dev/nvidiactl", path_in_container="/dev/nvidiactl")
+        )
+    return gpu_devices
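For illustration, a sketch of what the new helper returns, importing it from the module above as laid out in 0.19.6rc1 (the AWS instance type below is an arbitrary example):

# Sketch: GCP a3-edgegpu-8g / a3-highgpu-8g get the 8 per-GPU device
# nodes plus the NVIDIA UVM and control nodes passed into the container.
from dstack._internal.core.models.backends.base import BackendType
from dstack._internal.server.background.tasks.process_running_jobs import (
    _get_instance_specific_gpu_devices,
)

devices = _get_instance_specific_gpu_devices(BackendType.GCP, "a3-highgpu-8g")
assert len(devices) == 10  # /dev/nvidia0../dev/nvidia7, /dev/nvidia-uvm, /dev/nvidiactl
assert devices[0].path_on_host == devices[0].path_in_container == "/dev/nvidia0"
# Other backends/instance types get no instance-specific GPU devices:
assert _get_instance_specific_gpu_devices(BackendType.AWS, "g5.xlarge") == []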
dstack/_internal/server/routers/fleets.py

@@ -9,6 +9,7 @@ from dstack._internal.core.models.fleets import Fleet, FleetPlan
 from dstack._internal.server.db import get_session
 from dstack._internal.server.models import ProjectModel, UserModel
 from dstack._internal.server.schemas.fleets import (
+    ApplyFleetPlanRequest,
     CreateFleetRequest,
     DeleteFleetInstancesRequest,
     DeleteFleetsRequest,
@@ -107,6 +108,27 @@ async def get_plan(
     return plan


+@project_router.post("/apply")
+async def apply_plan(
+    body: ApplyFleetPlanRequest,
+    session: AsyncSession = Depends(get_session),
+    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
+) -> Fleet:
+    """
+    Creates a new fleet or updates an existing fleet.
+    Errors if the expected current resource from the plan does not match the current resource.
+    Use `force: true` to apply even if the current resource does not match.
+    """
+    user, project = user_project
+    return await fleets_services.apply_plan(
+        session=session,
+        user=user,
+        project=project,
+        plan=body.plan,
+        force=body.force,
+    )
+
+
 @project_router.post("/create")
 async def create_fleet(
     body: CreateFleetRequest,
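A hypothetical client-side call to the new endpoint. The `/api/project/<name>/fleets` prefix and bearer-token auth follow dstack's existing REST conventions, and the payload shape comes from `ApplyFleetPlanRequest` (see the schema hunks below); treat this as a sketch, not documented API:

# Hypothetical sketch: POST the plan obtained from a prior get_plan call.
import requests

resp = requests.post(
    "http://localhost:3000/api/project/main/fleets/apply",  # server URL and project are examples
    headers={"Authorization": "Bearer <your-dstack-token>"},
    json={
        "plan": fleet_plan,  # dict form of ApplyFleetPlanInput from a prior plan request
        "force": False,      # set True to apply even if the current resource changed
    },
)
resp.raise_for_status()
fleet = resp.json()  # the created or updated Fleet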
dstack/_internal/server/routers/runs.py

@@ -100,6 +100,7 @@ async def get_plan(
         project=project,
         user=user,
         run_spec=body.run_spec,
+        max_offers=body.max_offers,
     )
     return run_plan

dstack/_internal/server/schemas/fleets.py

@@ -1,11 +1,11 @@
 from datetime import datetime
-from typing import List, Optional
+from typing import Annotated, List, Optional
 from uuid import UUID

 from pydantic import Field

 from dstack._internal.core.models.common import CoreModel
-from dstack._internal.core.models.fleets import FleetSpec
+from dstack._internal.core.models.fleets import ApplyFleetPlanInput, FleetSpec


 class ListFleetsRequest(CoreModel):
@@ -26,6 +26,16 @@ class GetFleetPlanRequest(CoreModel):
     spec: FleetSpec


+class ApplyFleetPlanRequest(CoreModel):
+    plan: ApplyFleetPlanInput
+    force: Annotated[
+        bool,
+        Field(
+            description="Use `force: true` to apply even if the expected resource does not match."
+        ),
+    ]
+
+
 class CreateFleetRequest(CoreModel):
     spec: FleetSpec

dstack/_internal/server/schemas/runner.py

@@ -114,6 +114,11 @@ class TaskStatus(str, Enum):
     TERMINATED = "terminated"


+class GPUDevice(CoreModel):
+    path_on_host: str
+    path_in_container: str
+
+
 class TaskInfoResponse(CoreModel):
     id: str
     status: TaskStatus
@@ -139,6 +144,7 @@ class TaskSubmitRequest(CoreModel):
     volumes: list[ShimVolumeInfo]
     volume_mounts: list[VolumeMountPoint]
     instance_mounts: list[InstanceMountPoint]
+    gpu_devices: list[GPUDevice]
     host_ssh_user: str
     host_ssh_keys: list[str]
     container_ssh_keys: list[str]
dstack/_internal/server/schemas/runs.py

@@ -26,6 +26,9 @@ class GetRunRequest(CoreModel):

 class GetRunPlanRequest(CoreModel):
     run_spec: RunSpec
+    max_offers: Optional[int] = Field(
+        description="The maximum number of offers to return", ge=1, le=10000
+    )


 class SubmitRunRequest(CoreModel):
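`max_offers` is optional and bounds-checked by pydantic (`ge=1`, `le=10000`); omitted or `None` means the server falls back to its own limit (see `DEFAULT_MAX_OFFERS` in the services/runs.py hunks below). A quick sketch of the validation behavior, assuming a pre-built `run_spec`:

# Sketch: out-of-range values are rejected before planning starts.
from pydantic import ValidationError

from dstack._internal.server.schemas.runs import GetRunPlanRequest

GetRunPlanRequest(run_spec=run_spec, max_offers=100)   # ok
GetRunPlanRequest(run_spec=run_spec, max_offers=None)  # ok, server default applies
try:
    GetRunPlanRequest(run_spec=run_spec, max_offers=0)  # rejected: ge=1
except ValidationError as exc:
    print(exc)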
dstack/_internal/server/services/docker.py

@@ -91,8 +91,7 @@ def get_image_config(image_name: str, registry_auth: Optional[RegistryAuth]) ->
     config_resp = join_byte_stream_checked(config_stream, MAX_CONFIG_OBJECT_SIZE)
     if config_resp is None:
         raise DockerRegistryError(
-            "Image config object exceeds the size limit of "
-            f"{MAX_CONFIG_OBJECT_SIZE} bytes"
+            f"Image config object exceeds the size limit of {MAX_CONFIG_OBJECT_SIZE} bytes"
         )
     return ImageConfigObject.__response__.parse_raw(config_resp)

dstack/_internal/server/services/fleets.py

@@ -15,9 +15,9 @@ from dstack._internal.core.errors import (
     ResourceExistsError,
     ServerClientError,
 )
-from dstack._internal.core.models.common import is_core_model_instance
 from dstack._internal.core.models.envs import Env
 from dstack._internal.core.models.fleets import (
+    ApplyFleetPlanInput,
     Fleet,
     FleetPlan,
     FleetSpec,
@@ -234,32 +234,34 @@ async def get_plan(
     user: UserModel,
     spec: FleetSpec,
 ) -> FleetPlan:
+    effective_spec = FleetSpec.parse_obj(spec.dict())
     current_fleet: Optional[Fleet] = None
     current_fleet_id: Optional[uuid.UUID] = None
-    if spec.configuration.name is not None:
+    if effective_spec.configuration.name is not None:
         current_fleet_model = await get_project_fleet_model_by_name(
-            session=session, project=project, name=spec.configuration.name
+            session=session, project=project, name=effective_spec.configuration.name
         )
         if current_fleet_model is not None:
             current_fleet = fleet_model_to_fleet(current_fleet_model)
             current_fleet_id = current_fleet_model.id
-    await _check_ssh_hosts_not_yet_added(session, spec, current_fleet_id)
+    await _check_ssh_hosts_not_yet_added(session, effective_spec, current_fleet_id)

     offers = []
-    if spec.configuration.ssh_config is None:
+    if effective_spec.configuration.ssh_config is None:
         offers_with_backends = await get_create_instance_offers(
             project=project,
-            profile=spec.merged_profile,
-            requirements=_get_fleet_requirements(spec),
-            fleet_spec=spec,
-            blocks=spec.configuration.blocks,
+            profile=effective_spec.merged_profile,
+            requirements=_get_fleet_requirements(effective_spec),
+            fleet_spec=effective_spec,
+            blocks=effective_spec.configuration.blocks,
         )
         offers = [offer for _, offer in offers_with_backends]
-    _remove_fleet_spec_sensitive_info(spec)
+    _remove_fleet_spec_sensitive_info(effective_spec)
     plan = FleetPlan(
         project_name=project.name,
         user=user.name,
         spec=spec,
+        effective_spec=effective_spec,
         current_resource=current_fleet,
         offers=offers[:50],
         total_offers=len(offers),
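The `FleetSpec.parse_obj(spec.dict())` round-trip gives `get_plan` a deep, re-validated copy, so default-filling and `_remove_fleet_spec_sensitive_info` mutate only the `effective_spec` returned in the plan, never the caller's `spec`. A self-contained illustration of the pattern with a stand-in model:

# Stand-in model; dstack's CoreModel is a pydantic model, so the same applies.
from typing import Optional

from pydantic import BaseModel


class Spec(BaseModel):
    name: Optional[str] = None
    ssh_key: Optional[str] = None


spec = Spec(name="my-fleet", ssh_key="secret")
effective = Spec.parse_obj(spec.dict())  # deep copy via serialize + re-validate
effective.ssh_key = None                 # e.g. sensitive-info scrubbing
assert spec.ssh_key == "secret"          # the original spec is untouched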
@@ -307,6 +309,21 @@ async def get_create_instance_offers(
     return offers


+async def apply_plan(
+    session: AsyncSession,
+    user: UserModel,
+    project: ProjectModel,
+    plan: ApplyFleetPlanInput,
+    force: bool,
+) -> Fleet:
+    return await create_fleet(
+        session=session,
+        project=project,
+        user=user,
+        spec=plan.spec,
+    )
+
+
 async def create_fleet(
     session: AsyncSession,
     project: ProjectModel,
@@ -320,7 +337,7 @@ async def create_fleet(

     lock_namespace = f"fleet_names_{project.name}"
     if get_db().dialect_name == "sqlite":
-        # Start new transaction to see commited changes after lock
+        # Start new transaction to see committed changes after lock
         await session.commit()
     elif get_db().dialect_name == "postgresql":
         await session.execute(
@@ -402,6 +419,7 @@ async def create_fleet_instance_model(
         placement_group_name=placement_group_name,
         reservation=reservation,
         blocks=spec.configuration.blocks,
+        tags=spec.configuration.tags,
     )
     return instance_model

@@ -629,7 +647,7 @@ def _validate_fleet_spec(spec: FleetSpec):
     if spec.configuration.ssh_config.ssh_key is not None:
         _validate_ssh_key(spec.configuration.ssh_config.ssh_key)
     for host in spec.configuration.ssh_config.hosts:
-        if is_core_model_instance(host, SSHHostParams) and host.ssh_key is not None:
+        if isinstance(host, SSHHostParams) and host.ssh_key is not None:
             _validate_ssh_key(host.ssh_key)
     _validate_internal_ips(spec.configuration.ssh_config)

dstack/_internal/server/services/gateways/__init__.py

@@ -106,6 +106,7 @@ async def create_gateway_compute(
         public_ip=configuration.public_ip,
         ssh_key_pub=gateway_ssh_public_key,
         certificate=configuration.certificate,
+        tags=configuration.tags,
     )

     gpd = await run_async(
dstack/_internal/server/services/instances.py

@@ -1,7 +1,7 @@
 import uuid
 from collections.abc import Container, Iterable
 from datetime import datetime, timezone
-from typing import List, Literal, Optional, Union
+from typing import Dict, List, Literal, Optional, Union

 import gpuhunt
 from sqlalchemy import and_, or_, select
@@ -411,6 +411,7 @@ async def create_instance_model(
     placement_group_name: Optional[str],
     reservation: Optional[str],
     blocks: Union[Literal["auto"], int],
+    tags: Optional[Dict[str, str]],
 ) -> InstanceModel:
     termination_policy, termination_idle_time = get_termination(
         profile, DEFAULT_FLEET_TERMINATION_IDLE_TIME
@@ -428,6 +429,7 @@ async def create_instance_model(
         instance_id=str(instance_id),
         placement_group_name=placement_group_name,
         reservation=reservation,
+        tags=tags,
     )
     instance = InstanceModel(
         id=instance_id,
dstack/_internal/server/services/jobs/__init__.py

@@ -20,7 +20,6 @@ from dstack._internal.core.errors import (
     SSHError,
 )
 from dstack._internal.core.models.backends.base import BackendType
-from dstack._internal.core.models.common import is_core_model_instance
 from dstack._internal.core.models.configurations import RunConfigurationType
 from dstack._internal.core.models.instances import InstanceStatus
 from dstack._internal.core.models.runs import (
@@ -585,7 +584,7 @@ async def get_job_configured_volume_models(
     job_volumes = interpolate_job_volumes(run_spec.configuration.volumes, job_num)
     volume_models = []
     for mount_point in job_volumes:
-        if not is_core_model_instance(mount_point, VolumeMountPoint):
+        if not isinstance(mount_point, VolumeMountPoint):
             continue
         if isinstance(mount_point.name, str):
             names = [mount_point.name]
dstack/_internal/server/services/jobs/configurators/base.py

@@ -1,13 +1,14 @@
 import shlex
 import sys
 from abc import ABC, abstractmethod
+from pathlib import PurePosixPath
 from typing import Dict, List, Optional, Union

 from cachetools import TTLCache, cached

 import dstack.version as version
 from dstack._internal.core.errors import DockerRegistryError, ServerClientError
-from dstack._internal.core.models.common import RegistryAuth, is_core_model_instance
+from dstack._internal.core.models.common import RegistryAuth
 from dstack._internal.core.models.configurations import (
     PortMapping,
     PythonVersion,
@@ -131,16 +132,24 @@ class JobConfigurator(ABC):
         )
         return job_spec

+    def _shell(self) -> str:
+        shell = self.run_spec.configuration.shell
+        if shell is not None:
+            path = PurePosixPath(shell)
+            if path.is_absolute():
+                return shell
+            return str("/bin" / path)
+        if self.run_spec.configuration.image is None:  # dstackai/base
+            return "/bin/bash"
+        return "/bin/sh"
+
     async def _commands(self) -> List[str]:
         if self.run_spec.configuration.entrypoint is not None:  # docker-like format
             entrypoint = shlex.split(self.run_spec.configuration.entrypoint)
             commands = self.run_spec.configuration.commands
-        elif self.run_spec.configuration.image is None:  # dstackai/base
-            entrypoint = ["/bin/bash", "-i", "-c"]
-            commands = [_join_shell_commands(self._shell_commands())]
-        elif self._shell_commands():  # custom docker image with shell commands
-            entrypoint = ["/bin/sh", "-i", "-c"]
-            commands = [_join_shell_commands(self._shell_commands())]
+        elif shell_commands := self._shell_commands():
+            entrypoint = [self._shell(), "-i", "-c"]
+            commands = [_join_shell_commands(shell_commands)]
         else:  # custom docker image without commands
             image_config = await self._get_image_config()
             entrypoint = image_config.entrypoint or []
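The new `_shell` method centralizes shell selection: an absolute `shell` value is used as-is, a bare name is resolved under /bin (the `"/bin" / path` expression works because `PurePosixPath` implements the reflected `/` operator), and when `shell` is unset the default is bash for the stock image and sh for custom images. The same rules as a standalone, runnable sketch:

# Standalone sketch of the resolution rules in _shell() above.
from pathlib import PurePosixPath
from typing import Optional


def resolve_shell(shell: Optional[str], image: Optional[str]) -> str:
    if shell is not None:
        path = PurePosixPath(shell)
        if path.is_absolute():
            return shell
        return str("/bin" / path)  # str has no __truediv__, so PurePosixPath.__rtruediv__ applies
    if image is None:  # stock dstackai/base image
        return "/bin/bash"
    return "/bin/sh"


assert resolve_shell("zsh", None) == "/bin/zsh"
assert resolve_shell("/usr/bin/fish", "custom:latest") == "/usr/bin/fish"
assert resolve_shell(None, "custom:latest") == "/bin/sh"
assert resolve_shell(None, None) == "/bin/bash"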
@@ -274,7 +283,7 @@ def interpolate_job_volumes(
         if isinstance(mount_point, str):
            # pydantic validator ensures strings are converted to MountPoint
            continue
-        if not is_core_model_instance(mount_point, VolumeMountPoint):
+        if not isinstance(mount_point, VolumeMountPoint):
             job_volumes.append(mount_point.copy())
             continue
         if isinstance(mount_point.name, str):
dstack/_internal/server/services/locking.py

@@ -2,7 +2,7 @@ import asyncio
 import hashlib
 from asyncio import Lock
 from contextlib import asynccontextmanager
-from typing import Dict, List, Set, Tuple, TypeVar, Union
+from typing import AsyncGenerator, Dict, List, Set, Tuple, TypeVar, Union

 from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncConnection, AsyncSession
@@ -52,6 +52,21 @@ async def advisory_lock_ctx(
         await bind.execute(select(func.pg_advisory_unlock(string_to_lock_id(resource))))


+@asynccontextmanager
+async def try_advisory_lock_ctx(
+    bind: Union[AsyncConnection, AsyncSession], dialect_name: str, resource: str
+) -> AsyncGenerator[bool, None]:
+    locked = True
+    if dialect_name == "postgresql":
+        res = await bind.execute(select(func.pg_try_advisory_lock(string_to_lock_id(resource))))
+        locked = res.scalar_one()
+    try:
+        yield locked
+    finally:
+        if dialect_name == "postgresql" and locked:
+            await bind.execute(select(func.pg_advisory_unlock(string_to_lock_id(resource))))
+
+
 _locker = ResourceLocker()

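Unlike the blocking `advisory_lock_ctx` above it, this variant uses `pg_try_advisory_lock` and yields a boolean instead of waiting, so a caller can skip work another server replica is already doing; on SQLite it always yields `True` (locking is handled differently there). A usage sketch for a hypothetical caller, assuming `get_db` from `dstack._internal.server.db`:

# Hypothetical caller: skip the critical section rather than block on it.
from dstack._internal.server.db import get_db
from dstack._internal.server.services.locking import try_advisory_lock_ctx


async def maybe_do_exclusive_work(session, project_name: str) -> None:
    async with try_advisory_lock_ctx(
        bind=session,
        dialect_name=get_db().dialect_name,
        resource=f"fleet_names_{project_name}",
    ) as locked:
        if not locked:
            return  # another replica holds the lock; retry later
        ...  # work that must not run concurrently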
dstack/_internal/server/services/projects.py

@@ -14,7 +14,6 @@ from dstack._internal.core.backends.dstack.models import (
 )
 from dstack._internal.core.backends.models import BackendInfo
 from dstack._internal.core.errors import ForbiddenError, ResourceExistsError, ServerClientError
-from dstack._internal.core.models.common import is_core_model_instance
 from dstack._internal.core.models.projects import Member, MemberPermissions, Project
 from dstack._internal.core.models.users import GlobalRole, ProjectRole
 from dstack._internal.server.models import MemberModel, ProjectModel, UserModel
@@ -386,7 +385,7 @@ def project_model_to_project(
         backend_config = get_backend_config_from_backend_model(
             configurator, b, include_creds=False
         )
-        if is_core_model_instance(backend_config, DstackBackendConfig):
+        if isinstance(backend_config, DstackBackendConfig):
             for backend_type in backend_config.base_backends:
                 backends.append(
                     BackendInfo(
dstack/_internal/server/services/proxy/repo.py

@@ -7,7 +7,6 @@ from sqlalchemy.orm import joinedload

 import dstack._internal.server.services.jobs as jobs_services
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
-from dstack._internal.core.models.common import is_core_model_instance
 from dstack._internal.core.models.configurations import ServiceConfiguration
 from dstack._internal.core.models.instances import RemoteConnectionInfo, SSHConnectionParams
 from dstack._internal.core.models.runs import (
@@ -64,7 +63,7 @@ class ServerProxyRepo(BaseProxyRepo):
             return None
         run = jobs[0].run
         run_spec = RunSpec.__response__.parse_raw(run.run_spec)
-        if not is_core_model_instance(run_spec.configuration, ServiceConfiguration):
+        if not isinstance(run_spec.configuration, ServiceConfiguration):
             return None
         replicas = []
         for job in jobs:
dstack/_internal/server/services/runner/client.py

@@ -15,6 +15,7 @@ from dstack._internal.core.models.resources import Memory
 from dstack._internal.core.models.runs import ClusterInfo, JobSpec, RunSpec
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
 from dstack._internal.server.schemas.runner import (
+    GPUDevice,
     HealthcheckResponse,
     LegacyPullResponse,
     LegacyStopBody,
@@ -233,6 +234,7 @@ class ShimClient:
         volumes: list[Volume],
         volume_mounts: list[VolumeMountPoint],
         instance_mounts: list[InstanceMountPoint],
+        gpu_devices: list[GPUDevice],
         host_ssh_user: str,
         host_ssh_keys: list[str],
         container_ssh_keys: list[str],
@@ -256,6 +258,7 @@ class ShimClient:
             volumes=[_volume_to_shim_volume_info(v, instance_id) for v in volumes],
             volume_mounts=volume_mounts,
             instance_mounts=instance_mounts,
+            gpu_devices=gpu_devices,
             host_ssh_user=host_ssh_user,
             host_ssh_keys=host_ssh_keys,
             container_ssh_keys=container_ssh_keys,
dstack/_internal/server/services/runs.py

@@ -15,7 +15,7 @@ from dstack._internal.core.errors import (
     ResourceNotExistsError,
     ServerClientError,
 )
-from dstack._internal.core.models.common import ApplyAction, is_core_model_instance
+from dstack._internal.core.models.common import ApplyAction
 from dstack._internal.core.models.configurations import AnyRunConfiguration
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
@@ -92,6 +92,8 @@ JOB_TERMINATION_REASONS_TO_RETRY = {
     JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY,
 }

+DEFAULT_MAX_OFFERS = 50
+

 async def list_user_runs(
     session: AsyncSession,
@@ -275,46 +277,46 @@ async def get_plan(
     project: ProjectModel,
     user: UserModel,
     run_spec: RunSpec,
+    max_offers: Optional[int],
 ) -> RunPlan:
-    _validate_run_spec_and_set_defaults(run_spec)
+    effective_run_spec = RunSpec.parse_obj(run_spec.dict())
+    _validate_run_spec_and_set_defaults(effective_run_spec)

-    profile = run_spec.merged_profile
+    profile = effective_run_spec.merged_profile
     creation_policy = profile.creation_policy

     current_resource = None
     action = ApplyAction.CREATE
-    if run_spec.run_name is not None:
+    if effective_run_spec.run_name is not None:
         current_resource = await get_run_by_name(
             session=session,
             project=project,
-            run_name=run_spec.run_name,
+            run_name=effective_run_spec.run_name,
         )
         if (
             current_resource is not None
             and not current_resource.status.is_finished()
-            and _can_update_run_spec(current_resource.run_spec, run_spec)
+            and _can_update_run_spec(current_resource.run_spec, effective_run_spec)
         ):
             action = ApplyAction.UPDATE

-    # TODO(egor-s): do we need to generate all replicas here?
-    jobs = await get_jobs_from_run_spec(run_spec, replica_num=0)
+    jobs = await get_jobs_from_run_spec(effective_run_spec, replica_num=0)

     volumes = await get_job_configured_volumes(
         session=session,
         project=project,
-        run_spec=run_spec,
+        run_spec=effective_run_spec,
         job_num=0,
     )

     pool_offers = await _get_pool_offers(
         session=session,
         project=project,
-        run_spec=run_spec,
+        run_spec=effective_run_spec,
         job=jobs[0],
         volumes=volumes,
     )
-    run_name = run_spec.run_name  # preserve run_name
-    run_spec.run_name = "dry-run"  # will regenerate jobs on submission
+    effective_run_spec.run_name = "dry-run"  # will regenerate jobs on submission

     # Get offers once for all jobs
     offers = []
@@ -327,7 +329,7 @@ async def get_plan(
         multinode=jobs[0].job_spec.jobs_per_replica > 1,
         volumes=volumes,
         privileged=jobs[0].job_spec.privileged,
-        instance_mounts=check_run_spec_requires_instance_mounts(run_spec),
+        instance_mounts=check_run_spec_requires_instance_mounts(effective_run_spec),
     )

     job_plans = []
@@ -342,17 +344,18 @@ async def get_plan(

         job_plan = JobPlan(
             job_spec=job_spec,
-            offers=job_offers[:50],
+            offers=job_offers[: (max_offers or DEFAULT_MAX_OFFERS)],
             total_offers=len(job_offers),
             max_price=max((offer.price for offer in job_offers), default=None),
         )
         job_plans.append(job_plan)

-    run_spec.run_name = run_name  # restore run_name
+    effective_run_spec.run_name = run_spec.run_name  # restore run_name
     run_plan = RunPlan(
         project_name=project.name,
         user=user.name,
         run_spec=run_spec,
+        effective_run_spec=effective_run_spec,
         job_plans=job_plans,
         current_resource=current_resource,
         action=action,
@@ -748,7 +751,7 @@ async def _generate_run_name(

 def check_run_spec_requires_instance_mounts(run_spec: RunSpec) -> bool:
     return any(
-        is_core_model_instance(mp, InstanceMountPoint) and not mp.optional
+        isinstance(mp, InstanceMountPoint) and not mp.optional
         for mp in run_spec.configuration.volumes
     )