dstack 0.19.25rc1__py3-none-any.whl → 0.19.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.
Files changed (161)
  1. dstack/_internal/cli/commands/__init__.py +2 -2
  2. dstack/_internal/cli/commands/apply.py +3 -61
  3. dstack/_internal/cli/commands/attach.py +1 -1
  4. dstack/_internal/cli/commands/completion.py +1 -1
  5. dstack/_internal/cli/commands/delete.py +2 -2
  6. dstack/_internal/cli/commands/fleet.py +1 -1
  7. dstack/_internal/cli/commands/gateway.py +2 -2
  8. dstack/_internal/cli/commands/init.py +56 -24
  9. dstack/_internal/cli/commands/logs.py +1 -1
  10. dstack/_internal/cli/commands/metrics.py +1 -1
  11. dstack/_internal/cli/commands/offer.py +45 -7
  12. dstack/_internal/cli/commands/project.py +2 -2
  13. dstack/_internal/cli/commands/secrets.py +2 -2
  14. dstack/_internal/cli/commands/server.py +1 -1
  15. dstack/_internal/cli/commands/stop.py +1 -1
  16. dstack/_internal/cli/commands/volume.py +1 -1
  17. dstack/_internal/cli/main.py +2 -2
  18. dstack/_internal/cli/services/completion.py +2 -2
  19. dstack/_internal/cli/services/configurators/__init__.py +6 -2
  20. dstack/_internal/cli/services/configurators/base.py +6 -7
  21. dstack/_internal/cli/services/configurators/fleet.py +1 -3
  22. dstack/_internal/cli/services/configurators/gateway.py +2 -4
  23. dstack/_internal/cli/services/configurators/run.py +293 -58
  24. dstack/_internal/cli/services/configurators/volume.py +2 -4
  25. dstack/_internal/cli/services/profile.py +1 -1
  26. dstack/_internal/cli/services/repos.py +35 -48
  27. dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
  28. dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
  29. dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
  30. dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
  31. dstack/_internal/core/backends/aws/compute.py +6 -1
  32. dstack/_internal/core/backends/aws/configurator.py +11 -7
  33. dstack/_internal/core/backends/azure/configurator.py +11 -7
  34. dstack/_internal/core/backends/base/compute.py +33 -5
  35. dstack/_internal/core/backends/base/configurator.py +25 -13
  36. dstack/_internal/core/backends/base/offers.py +2 -0
  37. dstack/_internal/core/backends/cloudrift/configurator.py +13 -7
  38. dstack/_internal/core/backends/configurators.py +15 -0
  39. dstack/_internal/core/backends/cudo/configurator.py +11 -7
  40. dstack/_internal/core/backends/datacrunch/compute.py +5 -1
  41. dstack/_internal/core/backends/datacrunch/configurator.py +13 -7
  42. dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
  43. dstack/_internal/core/backends/digitalocean/backend.py +16 -0
  44. dstack/_internal/core/backends/digitalocean/compute.py +5 -0
  45. dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
  46. dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
  47. dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
  48. dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
  49. dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
  50. dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
  51. dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
  52. dstack/_internal/core/backends/gcp/compute.py +32 -8
  53. dstack/_internal/core/backends/gcp/configurator.py +11 -7
  54. dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
  55. dstack/_internal/core/backends/hotaisle/compute.py +1 -6
  56. dstack/_internal/core/backends/hotaisle/configurator.py +13 -7
  57. dstack/_internal/core/backends/kubernetes/configurator.py +13 -7
  58. dstack/_internal/core/backends/lambdalabs/configurator.py +11 -7
  59. dstack/_internal/core/backends/models.py +7 -0
  60. dstack/_internal/core/backends/nebius/compute.py +1 -8
  61. dstack/_internal/core/backends/nebius/configurator.py +11 -7
  62. dstack/_internal/core/backends/nebius/resources.py +21 -11
  63. dstack/_internal/core/backends/oci/compute.py +4 -5
  64. dstack/_internal/core/backends/oci/configurator.py +11 -7
  65. dstack/_internal/core/backends/runpod/configurator.py +11 -7
  66. dstack/_internal/core/backends/template/configurator.py.jinja +11 -7
  67. dstack/_internal/core/backends/tensordock/configurator.py +13 -7
  68. dstack/_internal/core/backends/vastai/configurator.py +11 -7
  69. dstack/_internal/core/backends/vultr/compute.py +1 -5
  70. dstack/_internal/core/backends/vultr/configurator.py +11 -4
  71. dstack/_internal/core/compatibility/fleets.py +5 -0
  72. dstack/_internal/core/compatibility/gpus.py +13 -0
  73. dstack/_internal/core/compatibility/runs.py +9 -1
  74. dstack/_internal/core/models/backends/base.py +5 -1
  75. dstack/_internal/core/models/common.py +3 -3
  76. dstack/_internal/core/models/configurations.py +191 -32
  77. dstack/_internal/core/models/files.py +1 -1
  78. dstack/_internal/core/models/fleets.py +80 -3
  79. dstack/_internal/core/models/profiles.py +41 -11
  80. dstack/_internal/core/models/resources.py +46 -42
  81. dstack/_internal/core/models/runs.py +28 -5
  82. dstack/_internal/core/services/configs/__init__.py +6 -3
  83. dstack/_internal/core/services/profiles.py +2 -2
  84. dstack/_internal/core/services/repos.py +86 -79
  85. dstack/_internal/core/services/ssh/ports.py +1 -1
  86. dstack/_internal/proxy/lib/deps.py +6 -2
  87. dstack/_internal/server/app.py +22 -17
  88. dstack/_internal/server/background/tasks/process_fleets.py +109 -13
  89. dstack/_internal/server/background/tasks/process_gateways.py +4 -1
  90. dstack/_internal/server/background/tasks/process_instances.py +22 -73
  91. dstack/_internal/server/background/tasks/process_probes.py +1 -1
  92. dstack/_internal/server/background/tasks/process_running_jobs.py +12 -4
  93. dstack/_internal/server/background/tasks/process_runs.py +3 -1
  94. dstack/_internal/server/background/tasks/process_submitted_jobs.py +67 -44
  95. dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
  96. dstack/_internal/server/background/tasks/process_volumes.py +1 -1
  97. dstack/_internal/server/db.py +8 -4
  98. dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
  99. dstack/_internal/server/models.py +6 -2
  100. dstack/_internal/server/routers/gpus.py +1 -6
  101. dstack/_internal/server/schemas/runner.py +11 -0
  102. dstack/_internal/server/services/backends/__init__.py +14 -8
  103. dstack/_internal/server/services/backends/handlers.py +6 -1
  104. dstack/_internal/server/services/docker.py +5 -5
  105. dstack/_internal/server/services/fleets.py +37 -38
  106. dstack/_internal/server/services/gateways/__init__.py +2 -0
  107. dstack/_internal/server/services/gateways/client.py +5 -2
  108. dstack/_internal/server/services/gateways/connection.py +1 -1
  109. dstack/_internal/server/services/gpus.py +50 -49
  110. dstack/_internal/server/services/instances.py +44 -4
  111. dstack/_internal/server/services/jobs/__init__.py +15 -4
  112. dstack/_internal/server/services/jobs/configurators/base.py +53 -17
  113. dstack/_internal/server/services/jobs/configurators/dev.py +9 -4
  114. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +6 -8
  115. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +7 -9
  116. dstack/_internal/server/services/jobs/configurators/service.py +1 -3
  117. dstack/_internal/server/services/jobs/configurators/task.py +3 -3
  118. dstack/_internal/server/services/locking.py +5 -5
  119. dstack/_internal/server/services/logging.py +10 -2
  120. dstack/_internal/server/services/logs/__init__.py +8 -6
  121. dstack/_internal/server/services/logs/aws.py +330 -327
  122. dstack/_internal/server/services/logs/filelog.py +7 -6
  123. dstack/_internal/server/services/logs/gcp.py +141 -139
  124. dstack/_internal/server/services/plugins.py +1 -1
  125. dstack/_internal/server/services/projects.py +2 -5
  126. dstack/_internal/server/services/proxy/repo.py +5 -1
  127. dstack/_internal/server/services/requirements/__init__.py +0 -0
  128. dstack/_internal/server/services/requirements/combine.py +259 -0
  129. dstack/_internal/server/services/runner/client.py +7 -0
  130. dstack/_internal/server/services/runs.py +17 -1
  131. dstack/_internal/server/services/services/__init__.py +8 -2
  132. dstack/_internal/server/services/services/autoscalers.py +2 -0
  133. dstack/_internal/server/services/ssh.py +2 -1
  134. dstack/_internal/server/services/storage/__init__.py +5 -6
  135. dstack/_internal/server/services/storage/gcs.py +49 -49
  136. dstack/_internal/server/services/storage/s3.py +52 -52
  137. dstack/_internal/server/statics/index.html +1 -1
  138. dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-4eecc75fbe64067eb1bc.js} +1146 -899
  139. dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-4eecc75fbe64067eb1bc.js.map} +1 -1
  140. dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-56191c63d516fd0041c4.css} +1 -1
  141. dstack/_internal/server/testing/common.py +7 -4
  142. dstack/_internal/server/utils/logging.py +3 -3
  143. dstack/_internal/server/utils/provisioning.py +3 -3
  144. dstack/_internal/utils/json_schema.py +3 -1
  145. dstack/_internal/utils/path.py +8 -1
  146. dstack/_internal/utils/ssh.py +7 -0
  147. dstack/_internal/utils/typing.py +14 -0
  148. dstack/api/_public/repos.py +62 -8
  149. dstack/api/_public/runs.py +19 -8
  150. dstack/api/server/__init__.py +17 -19
  151. dstack/api/server/_gpus.py +2 -1
  152. dstack/api/server/_group.py +4 -3
  153. dstack/api/server/_repos.py +20 -3
  154. dstack/plugins/builtin/rest_plugin/_plugin.py +1 -0
  155. dstack/version.py +1 -1
  156. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/METADATA +2 -2
  157. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/RECORD +160 -142
  158. dstack/api/huggingface/__init__.py +0 -73
  159. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/WHEEL +0 -0
  160. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/entry_points.txt +0 -0
  161. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/licenses/LICENSE.md +0 -0
@@ -93,6 +93,8 @@ async def create_gateway_compute(
     backend_id: Optional[uuid.UUID] = None,
 ) -> GatewayComputeModel:
     assert isinstance(backend_compute, ComputeWithGatewaySupport)
+    assert configuration.name is not None
+
     private_bytes, public_bytes = generate_rsa_key_pair_bytes()
     gateway_ssh_private_key = private_bytes.decode()
     gateway_ssh_public_key = public_bytes.decode()
@@ -7,7 +7,7 @@ from pydantic import parse_obj_as

 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
 from dstack._internal.core.errors import GatewayError
-from dstack._internal.core.models.configurations import RateLimit, ServiceConfiguration
+from dstack._internal.core.models.configurations import RateLimit
 from dstack._internal.core.models.instances import SSHConnectionParams
 from dstack._internal.core.models.runs import JobSpec, JobSubmission, Run, get_service_port
 from dstack._internal.proxy.gateway.schemas.stats import ServiceStats
@@ -85,7 +85,7 @@ class GatewayClient:
         ssh_head_proxy: Optional[SSHConnectionParams],
         ssh_head_proxy_private_key: Optional[str],
     ):
-        assert isinstance(run.run_spec.configuration, ServiceConfiguration)
+        assert run.run_spec.configuration.type == "service"
         payload = {
             "job_id": job_submission.id.hex,
             "app_port": get_service_port(job_spec, run.run_spec.configuration),
@@ -93,6 +93,9 @@ class GatewayClient:
             "ssh_head_proxy_private_key": ssh_head_proxy_private_key,
         }
         jpd = job_submission.job_provisioning_data
+        assert jpd is not None
+        assert jpd.hostname is not None
+        assert jpd.ssh_port is not None
         if not jpd.dockerized:
             payload.update(
                 {
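Note: the `isinstance(…, ServiceConfiguration)` check above is replaced with a comparison on the `type` discriminator field, which avoids importing the concrete configuration class. A minimal sketch of why this works with pydantic tagged unions (the model names below are illustrative, not dstack's actual models):

from typing import List, Literal, Union

from pydantic import BaseModel

class ServiceConf(BaseModel):
    type: Literal["service"] = "service"
    port: int

class TaskConf(BaseModel):
    type: Literal["task"] = "task"
    commands: List[str]

AnyConf = Union[ServiceConf, TaskConf]

def service_port(conf: AnyConf) -> int:
    # Comparing the Literal-typed `type` field narrows the union for type
    # checkers without an isinstance() check against the concrete class.
    assert conf.type == "service"
    return conf.port

print(service_port(ServiceConf(port=8000)))  # 8000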
@@ -67,7 +67,7 @@ class GatewayConnection:
             # reverse_forwarded_sockets are added later in .open()
         )
         self.tunnel_id = uuid.uuid4()
-        self._client = GatewayClient(uds=self.gateway_socket_path)
+        self._client = GatewayClient(uds=str(self.gateway_socket_path))

     @staticmethod
     def _init_symlink_dir(connection_dir: Path) -> Tuple[TemporaryDirectory, Path]:
@@ -1,8 +1,8 @@
 from typing import Dict, List, Literal, Optional, Tuple

-from sqlalchemy.ext.asyncio import AsyncSession
-
 from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.errors import ServerClientError
+from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.instances import InstanceOfferWithAvailability
 from dstack._internal.core.models.profiles import SpotPolicy
 from dstack._internal.core.models.resources import Range
@@ -15,10 +15,43 @@ from dstack._internal.server.schemas.gpus import (
     ListGpusResponse,
 )
 from dstack._internal.server.services.offers import get_offers_by_requirements
+from dstack._internal.utils.common import get_or_error
+
+
+async def list_gpus_grouped(
+    project: ProjectModel,
+    run_spec: RunSpec,
+    group_by: Optional[List[Literal["backend", "region", "count"]]] = None,
+) -> ListGpusResponse:
+    """Retrieves available GPU specifications based on a run spec, with optional grouping."""
+    offers = await _get_gpu_offers(project=project, run_spec=run_spec)
+    backend_gpus = _process_offers_into_backend_gpus(offers)
+    group_by_set = set(group_by) if group_by else set()
+    if "region" in group_by_set and "backend" not in group_by_set:
+        raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'")
+
+    # Determine grouping strategy based on combination
+    has_backend = "backend" in group_by_set
+    has_region = "region" in group_by_set
+    has_count = "count" in group_by_set
+    if has_backend and has_region and has_count:
+        gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus)
+    elif has_backend and has_count:
+        gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus)
+    elif has_backend and has_region:
+        gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus)
+    elif has_backend:
+        gpus = _get_gpus_grouped_by_backend(backend_gpus)
+    elif has_count:
+        gpus = _get_gpus_grouped_by_count(backend_gpus)
+    else:
+        gpus = _get_gpus_with_no_grouping(backend_gpus)
+
+    return ListGpusResponse(gpus=gpus)


 async def _get_gpu_offers(
-    session: AsyncSession, project: ProjectModel, run_spec: RunSpec
+    project: ProjectModel, run_spec: RunSpec
 ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     """Fetches all available instance offers that match the run spec's GPU requirements."""
     profile = run_spec.merged_profile
@@ -28,7 +61,6 @@ async def _get_gpu_offers(
         spot=get_policy_map(profile.spot_policy, default=SpotPolicy.AUTO),
         reservation=profile.reservation,
     )
-
     return await get_offers_by_requirements(
         project=project,
         profile=profile,
@@ -45,10 +77,10 @@ def _process_offers_into_backend_gpus(
     offers: List[Tuple[Backend, InstanceOfferWithAvailability]],
 ) -> List[BackendGpus]:
     """Transforms raw offers into a structured list of BackendGpus, aggregating GPU info."""
-    backend_data: Dict[str, Dict] = {}
+    backend_data: Dict[BackendType, Dict] = {}

-    for backend, offer in offers:
-        backend_type = backend.TYPE
+    for _, offer in offers:
+        backend_type = offer.backend
         if backend_type not in backend_data:
             backend_data[backend_type] = {"gpus": {}, "regions": set()}

@@ -111,7 +143,7 @@ def _process_offers_into_backend_gpus(
     return backend_gpus_list


-def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str):
+def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: BackendType):
     """Updates an existing GpuGroup with new data from another GPU offer."""
     spot_type: Literal["spot", "on-demand"] = "spot" if gpu.spot else "on-demand"

@@ -122,6 +154,12 @@ def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str):
     if row.backends and backend_type not in row.backends:
         row.backends.append(backend_type)

+    # FIXME: Consider using non-optional range
+    assert row.count.min is not None
+    assert row.count.max is not None
+    assert row.price.min is not None
+    assert row.price.max is not None
+
     row.count.min = min(row.count.min, gpu.count)
     row.count.max = max(row.count.max, gpu.count)
     per_gpu_price = gpu.price / gpu.count
@@ -194,7 +232,7 @@ def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGro
             not any(av.is_available() for av in g.availability),
             g.price.min,
             g.price.max,
-            g.backend.value,
+            get_or_error(g.backend).value,
             g.name,
             g.memory_mib,
         ),
@@ -229,7 +267,7 @@ def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) ->
             not any(av.is_available() for av in g.availability),
             g.price.min,
             g.price.max,
-            g.backend.value,
+            get_or_error(g.backend).value,
             g.region,
             g.name,
             g.memory_mib,
@@ -299,7 +337,7 @@ def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> L
             not any(av.is_available() for av in g.availability),
             g.price.min,
             g.price.max,
-            g.backend.value,
+            get_or_error(g.backend).value,
             g.count.min,
             g.name,
             g.memory_mib,
@@ -344,47 +382,10 @@ def _get_gpus_grouped_by_backend_region_and_count(
             not any(av.is_available() for av in g.availability),
             g.price.min,
             g.price.max,
-            g.backend.value,
+            get_or_error(g.backend).value,
             g.region,
             g.count.min,
             g.name,
             g.memory_mib,
         ),
     )
-
-
-async def list_gpus_grouped(
-    session: AsyncSession,
-    project: ProjectModel,
-    run_spec: RunSpec,
-    group_by: Optional[List[Literal["backend", "region", "count"]]] = None,
-) -> ListGpusResponse:
-    """Retrieves available GPU specifications based on a run spec, with optional grouping."""
-    offers = await _get_gpu_offers(session, project, run_spec)
-    backend_gpus = _process_offers_into_backend_gpus(offers)
-
-    group_by_set = set(group_by) if group_by else set()
-
-    if "region" in group_by_set and "backend" not in group_by_set:
-        from dstack._internal.core.errors import ServerClientError
-
-        raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'")
-
-    # Determine grouping strategy based on combination
-    has_backend = "backend" in group_by_set
-    has_region = "region" in group_by_set
-    has_count = "count" in group_by_set
-    if has_backend and has_region and has_count:
-        gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus)
-    elif has_backend and has_count:
-        gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus)
-    elif has_backend and has_region:
-        gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus)
-    elif has_backend:
-        gpus = _get_gpus_grouped_by_backend(backend_gpus)
-    elif has_count:
-        gpus = _get_gpus_grouped_by_count(backend_gpus)
-    else:
-        gpus = _get_gpus_with_no_grouping(backend_gpus)
-
-    return ListGpusResponse(gpus=gpus)
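Note: the sort keys above now wrap `g.backend` in `get_or_error` because the field is optional on the grouped model. A minimal sketch of what a helper with this name typically does (an assumption about the shape of `dstack._internal.utils.common.get_or_error`, not a copy of it):

from typing import Optional, TypeVar

T = TypeVar("T")

def get_or_error(value: Optional[T]) -> T:
    # Narrows Optional[T] to T, failing loudly if the invariant is violated,
    # so sort keys never compare None against enum values.
    if value is None:
        raise ValueError("Expected a non-None value")
    return value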
@@ -39,6 +39,7 @@ from dstack._internal.core.models.profiles import (
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.core.services.profiles import get_termination
+from dstack._internal.server import settings as server_settings
 from dstack._internal.server.models import (
     FleetModel,
     InstanceHealthCheckModel,
@@ -47,9 +48,11 @@ from dstack._internal.server.models import (
     UserModel,
 )
 from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse
-from dstack._internal.server.schemas.runner import InstanceHealthResponse
+from dstack._internal.server.schemas.runner import InstanceHealthResponse, TaskStatus
+from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import generate_shared_offer
 from dstack._internal.server.services.projects import list_user_project_models
+from dstack._internal.server.services.runner.client import ShimClient
 from dstack._internal.utils import common as common_utils
 from dstack._internal.utils.logging import get_logger

@@ -510,10 +513,10 @@ async def list_active_remote_instances(
     return instance_models


-async def create_instance_model(
+def create_instance_model(
     session: AsyncSession,
     project: ProjectModel,
-    user: UserModel,
+    username: str,
     profile: Profile,
     requirements: Requirements,
     instance_name: str,
@@ -533,7 +536,7 @@
     instance_config = InstanceConfiguration(
         project_name=project.name,
         instance_name=instance_name,
-        user=user.name,
+        user=username,
         ssh_keys=[project_ssh_key],
         instance_id=str(instance_id),
         reservation=reservation,
@@ -633,3 +636,40 @@ async def create_ssh_instance_model(
         busy_blocks=0,
     )
     return im
+
+
+def remove_dangling_tasks_from_instance(shim_client: ShimClient, instance: InstanceModel) -> None:
+    if not shim_client.is_api_v2_supported():
+        return
+    assigned_to_instance_job_ids = {str(j.id) for j in instance.jobs}
+    task_list_response = shim_client.list_tasks()
+    tasks: list[tuple[str, Optional[TaskStatus]]]
+    if task_list_response.tasks is not None:
+        tasks = [(t.id, t.status) for t in task_list_response.tasks]
+    elif task_list_response.ids is not None:
+        # compatibility with pre-0.19.26 shim
+        tasks = [(t_id, None) for t_id in task_list_response.ids]
+    else:
+        raise ValueError("Unexpected task list response, neither `tasks` nor `ids` is set")
+    for task_id, task_status in tasks:
+        if task_id in assigned_to_instance_job_ids:
+            continue
+        should_terminate = task_status != TaskStatus.TERMINATED
+        should_remove = not server_settings.SERVER_KEEP_SHIM_TASKS
+        if not (should_terminate or should_remove):
+            continue
+        logger.warning(
+            "%s: dangling task found, id=%s, status=%s. Terminating and/or removing",
+            fmt(instance),
+            task_id,
+            task_status or "<unknown>",
+        )
+        if should_terminate:
+            shim_client.terminate_task(
+                task_id=task_id,
+                reason=None,
+                message=None,
+                timeout=0,
+            )
+        if should_remove:
+            shim_client.remove_task(task_id=task_id)
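Note: `remove_dangling_tasks_from_instance` has to handle two shim generations: post-0.19.26 shims report task objects with a status, while older shims only report ids. A self-contained sketch of that normalization step (the dataclasses below are stand-ins for illustration, not dstack's actual response models):

from dataclasses import dataclass
from typing import List, Optional, Tuple

@dataclass
class TaskInfo:
    id: str
    status: Optional[str]

@dataclass
class TaskListResponse:
    tasks: Optional[List[TaskInfo]] = None  # newer shims: id + status
    ids: Optional[List[str]] = None         # pre-0.19.26 shims: ids only

def normalize(resp: TaskListResponse) -> List[Tuple[str, Optional[str]]]:
    if resp.tasks is not None:
        return [(t.id, t.status) for t in resp.tasks]
    if resp.ids is not None:
        # Legacy shim: status is unknown, so the caller must assume the
        # task may still be running and terminate it defensively.
        return [(t_id, None) for t_id in resp.ids]
    raise ValueError("neither `tasks` nor `ids` is set")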
@@ -256,7 +256,16 @@ async def process_terminating_job(
     if jpd is not None:
         logger.debug("%s: stopping container", fmt(job_model))
         ssh_private_keys = get_instance_ssh_private_keys(instance_model)
-        await stop_container(job_model, jpd, ssh_private_keys)
+        if not await stop_container(job_model, jpd, ssh_private_keys):
+            # The dangling container can be removed later during instance processing
+            logger.warning(
+                (
+                    "%s: could not stop container, possibly due to a communication error."
+                    " See debug logs for details."
+                    " Ignoring, can attempt to remove the container later"
+                ),
+                fmt(job_model),
+            )
     if jrd is not None and jrd.volume_names is not None:
         volume_names = jrd.volume_names
     else:
@@ -378,21 +387,22 @@ async def stop_container(
     job_model: JobModel,
     job_provisioning_data: JobProvisioningData,
     ssh_private_keys: tuple[str, Optional[str]],
-):
+) -> bool:
     if job_provisioning_data.dockerized:
         # send a request to the shim to terminate the docker container
         # SSHError and RequestException are caught in the `runner_ssh_tunner` decorator
-        await run_async(
+        return await run_async(
             _shim_submit_stop,
             ssh_private_keys,
             job_provisioning_data,
             None,
             job_model,
         )
+    return True


 @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT])
-def _shim_submit_stop(ports: Dict[int, int], job_model: JobModel):
+def _shim_submit_stop(ports: Dict[int, int], job_model: JobModel) -> bool:
     shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])

     resp = shim_client.healthcheck()
@@ -418,6 +428,7 @@ def _shim_submit_stop(ports: Dict[int, int], job_model: JobModel):
         shim_client.remove_task(task_id=job_model.id)
     else:
         shim_client.stop(force=True)
+    return True


 def group_jobs_by_replica_latest(jobs: List[JobModel]) -> Iterable[Tuple[int, List[JobModel]]]:
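Note: `stop_container` can now report failure because, per the comment above, the `runner_ssh_tunnel` decorator catches SSHError/RequestException; the new `bool` return lets `process_terminating_job` log a warning and move on instead of silently treating a failed stop as success. A rough sketch of the pattern (not dstack's actual decorator; the exception types are stand-ins):

import functools
from typing import Any, Callable

def swallow_tunnel_errors(func: Callable[..., bool]) -> Callable[..., bool]:
    # If the tunneled call raises a communication error, return False so
    # the caller can detect the failure without an exception escaping.
    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> bool:
        try:
            return func(*args, **kwargs)
        except (ConnectionError, TimeoutError):
            return False
    return wrapper

@swallow_tunnel_errors
def stop_task() -> bool:
    # talk to the shim over the tunnel here
    return True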
@@ -3,7 +3,7 @@ import sys
 import threading
 from abc import ABC, abstractmethod
 from pathlib import PurePosixPath
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional

 from cachetools import TTLCache, cached

@@ -16,7 +16,7 @@ from dstack._internal.core.models.configurations import (
     DEFAULT_PROBE_READY_AFTER,
     DEFAULT_PROBE_TIMEOUT,
     DEFAULT_PROBE_URL,
-    DEFAULT_REPO_DIR,
+    LEGACY_REPO_DIR,
     PortMapping,
     ProbeConfig,
     PythonVersion,
@@ -45,6 +45,14 @@ from dstack._internal.server.services.docker import ImageConfig, get_image_confi
 from dstack._internal.utils import crypto
 from dstack._internal.utils.common import run_async
 from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator
+from dstack._internal.utils.logging import get_logger
+from dstack._internal.utils.path import is_absolute_posix_path
+
+logger = get_logger(__name__)
+
+
+DSTACK_DIR = "/dstack"
+DSTACK_PROFILE_PATH = f"{DSTACK_DIR}/profile"


 def get_default_python_verison() -> str:
@@ -160,6 +168,7 @@ class JobConfigurator(ABC):
             ssh_key=self._ssh_key(jobs_per_replica),
             repo_data=self.run_spec.repo_data,
             repo_code_hash=self.run_spec.repo_code_hash,
+            repo_dir=self._repo_dir(),
             file_archives=self.run_spec.file_archives,
             service_port=self._service_port(),
             probes=self._probes(),
@@ -179,6 +188,7 @@ class JobConfigurator(ABC):

     async def _commands(self) -> List[str]:
         if self.run_spec.configuration.entrypoint is not None:  # docker-like format
+            assert self.run_spec.configuration.type != "dev-environment"
             entrypoint = shlex.split(self.run_spec.configuration.entrypoint)
             commands = self.run_spec.configuration.commands
         elif shell_commands := self._shell_commands():
@@ -208,9 +218,17 @@ class JobConfigurator(ABC):
         ):
             return []
         return [
-            f"uv venv --python {self._python()} --prompt workflow --seed {DEFAULT_REPO_DIR}/.venv > /dev/null 2>&1",
-            f"echo 'source {DEFAULT_REPO_DIR}/.venv/bin/activate' >> ~/.bashrc",
-            f"source {DEFAULT_REPO_DIR}/.venv/bin/activate",
+            # `uv` may emit:
+            # > warning: `VIRTUAL_ENV=/dstack/venv` does not match the project environment path
+            # > `.venv` and will be ignored; use `--active` to target the active environment
+            # > instead
+            # Safe to ignore, reusing dstack's venv for `uv` is discouraged (it should only be
+            # used for legacy `pip`-based configurations). `--no-active` suppresses the warning.
+            # Alternatively, the user can call `deactivate` once before using `uv`.
+            # If the user really wants to reuse dstack's venv, they must spefify `--active`.
+            f"uv venv -q --prompt dstack -p {self._python()} --seed {DSTACK_DIR}/venv",
+            f"echo '. {DSTACK_DIR}/venv/bin/activate' >> {DSTACK_PROFILE_PATH}",
+            f". {DSTACK_DIR}/venv/bin/activate",
         ]

     def _app_specs(self) -> List[AppSpec]:
@@ -258,19 +276,17 @@ class JobConfigurator(ABC):
         return self.run_spec.configuration.single_branch

     def _max_duration(self) -> Optional[int]:
-        if self.run_spec.merged_profile.max_duration in [None, True]:
+        if self.run_spec.merged_profile.max_duration is None:
             return self._default_max_duration()
-        if self.run_spec.merged_profile.max_duration in ["off", False]:
+        if self.run_spec.merged_profile.max_duration == "off":
             return None
-        # pydantic validator ensures this is int
         return self.run_spec.merged_profile.max_duration

     def _stop_duration(self) -> Optional[int]:
-        if self.run_spec.merged_profile.stop_duration in [None, True]:
+        if self.run_spec.merged_profile.stop_duration is None:
             return DEFAULT_STOP_DURATION
-        if self.run_spec.merged_profile.stop_duration in ["off", False]:
+        if self.run_spec.merged_profile.stop_duration == "off":
             return None
-        # pydantic validator ensures this is int
         return self.run_spec.merged_profile.stop_duration

     def _utilization_policy(self) -> Optional[UtilizationPolicy]:
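Note: the simplified checks reflect that `max_duration`/`stop_duration` now reach the configurator as `None`, the literal `"off"`, or an `int` number of seconds; the boolean spellings are normalized away earlier by validation. Illustrative behavior of the resolution logic:

from typing import Optional, Union

Duration = Union[None, str, int]  # None | "off" | seconds

def resolve(value: Duration, default: Optional[int]) -> Optional[int]:
    if value is None:
        return default  # fall back to the configurator's default
    if value == "off":
        return None     # explicitly unlimited
    return value        # already an int after validation

assert resolve(None, 3600) == 3600
assert resolve("off", 3600) is None
assert resolve(7200, 3600) == 7200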
@@ -291,11 +307,34 @@ class JobConfigurator(ABC):
     def _retry(self) -> Optional[Retry]:
         return get_retry(self.run_spec.merged_profile)

+    def _repo_dir(self) -> str:
+        """
+        Returns absolute or relative path
+        """
+        repo_dir = self.run_spec.repo_dir
+        if repo_dir is None:
+            return LEGACY_REPO_DIR
+        return repo_dir
+
     def _working_dir(self) -> Optional[str]:
         """
-        None means default working directory
+        Returns path or None
+
+        None means the default working directory taken from the image
+
+        Currently, for compatibility with pre-0.19.27 runners, the path may be relative.
+        Future versions should return only absolute paths
         """
-        return self.run_spec.working_dir
+        working_dir = self.run_spec.configuration.working_dir
+        if working_dir is None:
+            return working_dir
+        # Return a relative path if possible
+        if is_absolute_posix_path(working_dir):
+            try:
+                return str(PurePosixPath(working_dir).relative_to(LEGACY_REPO_DIR))
+            except ValueError:
+                pass
+        return working_dir

     def _python(self) -> str:
         if self.run_spec.configuration.python is not None:
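Note: the rewritten `_working_dir` relies on `PurePosixPath.relative_to`, which raises `ValueError` when the path is not under the legacy repo dir. Assuming `LEGACY_REPO_DIR` is `/workflow` (the historical default repo dir; an assumption here), the behavior is:

from pathlib import PurePosixPath

LEGACY_REPO_DIR = "/workflow"  # assumption: the historical default repo dir

# A working dir under the legacy repo dir is rewritten as a relative path
# for compatibility with pre-0.19.27 runners...
assert str(PurePosixPath("/workflow/src").relative_to(LEGACY_REPO_DIR)) == "src"

# ...while a path outside it raises ValueError and is returned unchanged.
try:
    PurePosixPath("/opt/app").relative_to(LEGACY_REPO_DIR)
except ValueError:
    print("not under the legacy repo dir; keep the absolute path")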
@@ -328,7 +367,7 @@ class JobConfigurator(ABC):


 def interpolate_job_volumes(
-    run_volumes: List[Union[MountPoint, str]],
+    run_volumes: List[MountPoint],
     job_num: int,
 ) -> List[MountPoint]:
     if len(run_volumes) == 0:
@@ -343,9 +382,6 @@ def interpolate_job_volumes(
     )
     job_volumes = []
     for mount_point in run_volumes:
-        if isinstance(mount_point, str):
-            # pydantic validator ensures strings are converted to MountPoint
-            continue
         if not isinstance(mount_point, VolumeMountPoint):
             job_volumes.append(mount_point.copy())
             continue
@@ -9,8 +9,8 @@ from dstack._internal.server.services.jobs.configurators.extensions.cursor impor
 from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop

 INSTALL_IPYKERNEL = (
-    "(echo pip install ipykernel... && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
-    'echo "no pip, ipykernel was not installed"'
+    "(echo 'pip install ipykernel...' && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
+    "echo 'no pip, ipykernel was not installed'"
 )

@@ -18,6 +18,8 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
     TYPE: RunConfigurationType = RunConfigurationType.DEV_ENVIRONMENT

     def __init__(self, run_spec: RunSpec, secrets: Dict[str, str]):
+        assert run_spec.configuration.type == "dev-environment"
+
         if run_spec.configuration.ide == "vscode":
             __class = VSCodeDesktop
         elif run_spec.configuration.ide == "cursor":
32
34
  super().__init__(run_spec=run_spec, secrets=secrets)
33
35
 
34
36
  def _shell_commands(self) -> List[str]:
37
+ assert self.run_spec.configuration.type == "dev-environment"
38
+
35
39
  commands = self.ide.get_install_commands()
36
40
  commands.append(INSTALL_IPYKERNEL)
37
41
  commands += self.run_spec.configuration.setup
38
- commands.append("echo ''")
42
+ commands.append("echo")
39
43
  commands += self.run_spec.configuration.init
40
44
  commands += self.ide.get_print_readme_commands()
41
45
  commands += [
42
46
  f"echo 'To connect via SSH, use: `ssh {self.run_spec.run_name}`'",
43
- "echo ''",
47
+ "echo",
44
48
  "echo -n 'To exit, press Ctrl+C.'",
45
49
  ]
46
50
  commands += ["tail -f /dev/null"] # idle
@@ -56,4 +60,5 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
56
60
  return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
57
61
 
58
62
  def _ports(self) -> List[PortMapping]:
63
+ assert self.run_spec.configuration.type == "dev-environment"
59
64
  return self.run_spec.configuration.ports
@@ -1,13 +1,11 @@
-from typing import List
-
-from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
+from typing import List, Optional


 class CursorDesktop:
     def __init__(
         self,
-        run_name: str,
-        version: str,
+        run_name: Optional[str],
+        version: Optional[str],
         extensions: List[str],
     ):
         self.run_name = run_name
@@ -38,7 +36,7 @@ class CursorDesktop:
     def get_print_readme_commands(self) -> List[str]:
         return [
             "echo To open in Cursor, use link below:",
-            "echo ''",
-            f"echo ' cursor://vscode-remote/ssh-remote+{self.run_name}{DEFAULT_REPO_DIR}'",  # TODO use $REPO_DIR
-            "echo ''",
+            "echo",
+            f'echo " cursor://vscode-remote/ssh-remote+{self.run_name}$DSTACK_REPO_DIR"',
+            "echo",
         ]
@@ -1,13 +1,11 @@
-from typing import List
-
-from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
+from typing import List, Optional


 class VSCodeDesktop:
     def __init__(
         self,
-        run_name: str,
-        version: str,
+        run_name: Optional[str],
+        version: Optional[str],
         extensions: List[str],
     ):
         self.run_name = run_name
@@ -37,8 +35,8 @@ class VSCodeDesktop:

     def get_print_readme_commands(self) -> List[str]:
         return [
-            "echo To open in VS Code Desktop, use link below:",
-            "echo ''",
-            f"echo ' vscode://vscode-remote/ssh-remote+{self.run_name}{DEFAULT_REPO_DIR}'",  # TODO use $REPO_DIR
-            "echo ''",
+            "echo 'To open in VS Code Desktop, use link below:'",
+            "echo",
+            f'echo " vscode://vscode-remote/ssh-remote+{self.run_name}$DSTACK_REPO_DIR"',
+            "echo",
         ]
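Note the quoting change in both IDE helpers: the old commands baked the Python-side `DEFAULT_REPO_DIR` constant into a single-quoted string, while the new ones use double quotes so the shell expands `$DSTACK_REPO_DIR` when the job runs, letting the printed link follow the run's actual repo dir. For example (illustrative values):

run_name = "my-run"  # illustrative
# Old style: the path is fixed at configuration time.
old = f"echo ' vscode://vscode-remote/ssh-remote+{run_name}/workflow'"
# New style: double quotes let the shell substitute the variable at runtime.
new = f'echo " vscode://vscode-remote/ssh-remote+{run_name}$DSTACK_REPO_DIR"'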
@@ -9,6 +9,7 @@ class ServiceJobConfigurator(JobConfigurator):
     TYPE: RunConfigurationType = RunConfigurationType.SERVICE

     def _shell_commands(self) -> List[str]:
+        assert self.run_spec.configuration.type == "service"
         return self.run_spec.configuration.commands

     def _default_single_branch(self) -> bool:
@@ -22,6 +23,3 @@ class ServiceJobConfigurator(JobConfigurator):

     def _ports(self) -> List[PortMapping]:
         return []
-
-    def _working_dir(self) -> Optional[str]:
-        return None if not self._shell_commands() else super()._working_dir()
@@ -10,6 +10,7 @@ class TaskJobConfigurator(JobConfigurator):
     TYPE: RunConfigurationType = RunConfigurationType.TASK

     async def get_job_specs(self, replica_num: int) -> List[JobSpec]:
+        assert self.run_spec.configuration.type == "task"
         job_specs = []
         for job_num in range(self.run_spec.configuration.nodes):
             job_spec = await self._get_job_spec(
@@ -21,6 +22,7 @@ class TaskJobConfigurator(JobConfigurator):
         return job_specs

     def _shell_commands(self) -> List[str]:
+        assert self.run_spec.configuration.type == "task"
         return self.run_spec.configuration.commands

     def _default_single_branch(self) -> bool:
@@ -33,7 +35,5 @@ class TaskJobConfigurator(JobConfigurator):
         return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND

     def _ports(self) -> List[PortMapping]:
+        assert self.run_spec.configuration.type == "task"
         return self.run_spec.configuration.ports
-
-    def _working_dir(self) -> Optional[str]:
-        return None if not self._shell_commands() else super()._working_dir()
@@ -23,13 +23,13 @@ T = TypeVar("T")


 class Lockset(Protocol[T]):
-    def __contains__(self, item: T) -> bool: ...
+    def __contains__(self, item: T, /) -> bool: ...
     def __iter__(self) -> Iterator[T]: ...
     def __len__(self) -> int: ...
-    def add(self, item: T) -> None: ...
-    def discard(self, item: T) -> None: ...
-    def update(self, other: Iterable[T]) -> None: ...
-    def difference_update(self, other: Iterable[T]) -> None: ...
+    def add(self, item: T, /) -> None: ...
+    def discard(self, item: T, /) -> None: ...
+    def update(self, other: Iterable[T], /) -> None: ...
+    def difference_update(self, other: Iterable[T], /) -> None: ...


 class ResourceLocker:
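Note: the trailing `/` markers make the `Lockset` parameters positional-only, matching how built-in `set` methods accept their arguments; without them, a type checker would reject `set[T]` as an implementation because `set.add` cannot be called with `item=` as a keyword. A minimal demonstration:

from typing import Protocol, TypeVar

T = TypeVar("T")

class SupportsAdd(Protocol[T]):
    # `/` makes `item` positional-only, as it is on built-in set.add.
    def add(self, item: T, /) -> None: ...

def track(lockset: SupportsAdd[str]) -> None:
    lockset.add("resource-1")

track(set())  # OK: set structurally satisfies the positional-only protocol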