dstack 0.19.25__py3-none-any.whl → 0.19.26__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (128)
  1. dstack/_internal/cli/commands/__init__.py +2 -2
  2. dstack/_internal/cli/commands/apply.py +3 -61
  3. dstack/_internal/cli/commands/attach.py +1 -1
  4. dstack/_internal/cli/commands/completion.py +1 -1
  5. dstack/_internal/cli/commands/delete.py +2 -2
  6. dstack/_internal/cli/commands/fleet.py +1 -1
  7. dstack/_internal/cli/commands/gateway.py +2 -2
  8. dstack/_internal/cli/commands/init.py +56 -24
  9. dstack/_internal/cli/commands/logs.py +1 -1
  10. dstack/_internal/cli/commands/metrics.py +1 -1
  11. dstack/_internal/cli/commands/offer.py +45 -7
  12. dstack/_internal/cli/commands/project.py +2 -2
  13. dstack/_internal/cli/commands/secrets.py +2 -2
  14. dstack/_internal/cli/commands/server.py +1 -1
  15. dstack/_internal/cli/commands/stop.py +1 -1
  16. dstack/_internal/cli/commands/volume.py +1 -1
  17. dstack/_internal/cli/main.py +2 -2
  18. dstack/_internal/cli/services/completion.py +2 -2
  19. dstack/_internal/cli/services/configurators/__init__.py +6 -2
  20. dstack/_internal/cli/services/configurators/base.py +6 -7
  21. dstack/_internal/cli/services/configurators/fleet.py +1 -3
  22. dstack/_internal/cli/services/configurators/gateway.py +2 -4
  23. dstack/_internal/cli/services/configurators/run.py +195 -55
  24. dstack/_internal/cli/services/configurators/volume.py +2 -4
  25. dstack/_internal/cli/services/profile.py +1 -1
  26. dstack/_internal/cli/services/repos.py +51 -47
  27. dstack/_internal/core/backends/aws/configurator.py +11 -7
  28. dstack/_internal/core/backends/azure/configurator.py +11 -7
  29. dstack/_internal/core/backends/base/configurator.py +25 -13
  30. dstack/_internal/core/backends/cloudrift/configurator.py +13 -7
  31. dstack/_internal/core/backends/cudo/configurator.py +11 -7
  32. dstack/_internal/core/backends/datacrunch/compute.py +5 -1
  33. dstack/_internal/core/backends/datacrunch/configurator.py +13 -7
  34. dstack/_internal/core/backends/gcp/configurator.py +11 -7
  35. dstack/_internal/core/backends/hotaisle/configurator.py +13 -7
  36. dstack/_internal/core/backends/kubernetes/configurator.py +13 -7
  37. dstack/_internal/core/backends/lambdalabs/configurator.py +11 -7
  38. dstack/_internal/core/backends/nebius/compute.py +1 -1
  39. dstack/_internal/core/backends/nebius/configurator.py +11 -7
  40. dstack/_internal/core/backends/nebius/resources.py +21 -11
  41. dstack/_internal/core/backends/oci/configurator.py +11 -7
  42. dstack/_internal/core/backends/runpod/configurator.py +11 -7
  43. dstack/_internal/core/backends/template/configurator.py.jinja +11 -7
  44. dstack/_internal/core/backends/tensordock/configurator.py +13 -7
  45. dstack/_internal/core/backends/vastai/configurator.py +11 -7
  46. dstack/_internal/core/backends/vultr/configurator.py +11 -4
  47. dstack/_internal/core/compatibility/gpus.py +13 -0
  48. dstack/_internal/core/compatibility/runs.py +1 -0
  49. dstack/_internal/core/models/common.py +3 -3
  50. dstack/_internal/core/models/configurations.py +172 -27
  51. dstack/_internal/core/models/files.py +1 -1
  52. dstack/_internal/core/models/fleets.py +5 -1
  53. dstack/_internal/core/models/profiles.py +41 -11
  54. dstack/_internal/core/models/resources.py +46 -42
  55. dstack/_internal/core/models/runs.py +4 -0
  56. dstack/_internal/core/services/configs/__init__.py +2 -2
  57. dstack/_internal/core/services/profiles.py +2 -2
  58. dstack/_internal/core/services/repos.py +5 -3
  59. dstack/_internal/core/services/ssh/ports.py +1 -1
  60. dstack/_internal/proxy/lib/deps.py +6 -2
  61. dstack/_internal/server/app.py +22 -17
  62. dstack/_internal/server/background/tasks/process_gateways.py +4 -1
  63. dstack/_internal/server/background/tasks/process_instances.py +10 -2
  64. dstack/_internal/server/background/tasks/process_probes.py +1 -1
  65. dstack/_internal/server/background/tasks/process_running_jobs.py +10 -4
  66. dstack/_internal/server/background/tasks/process_runs.py +1 -1
  67. dstack/_internal/server/background/tasks/process_submitted_jobs.py +54 -43
  68. dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
  69. dstack/_internal/server/background/tasks/process_volumes.py +1 -1
  70. dstack/_internal/server/db.py +8 -4
  71. dstack/_internal/server/models.py +1 -0
  72. dstack/_internal/server/routers/gpus.py +1 -6
  73. dstack/_internal/server/schemas/runner.py +10 -0
  74. dstack/_internal/server/services/backends/__init__.py +14 -8
  75. dstack/_internal/server/services/backends/handlers.py +6 -1
  76. dstack/_internal/server/services/docker.py +5 -5
  77. dstack/_internal/server/services/fleets.py +14 -13
  78. dstack/_internal/server/services/gateways/__init__.py +2 -0
  79. dstack/_internal/server/services/gateways/client.py +5 -2
  80. dstack/_internal/server/services/gateways/connection.py +1 -1
  81. dstack/_internal/server/services/gpus.py +50 -49
  82. dstack/_internal/server/services/instances.py +41 -1
  83. dstack/_internal/server/services/jobs/__init__.py +15 -4
  84. dstack/_internal/server/services/jobs/configurators/base.py +7 -11
  85. dstack/_internal/server/services/jobs/configurators/dev.py +5 -0
  86. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -3
  87. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +3 -3
  88. dstack/_internal/server/services/jobs/configurators/service.py +1 -0
  89. dstack/_internal/server/services/jobs/configurators/task.py +3 -0
  90. dstack/_internal/server/services/locking.py +5 -5
  91. dstack/_internal/server/services/logging.py +10 -2
  92. dstack/_internal/server/services/logs/__init__.py +8 -6
  93. dstack/_internal/server/services/logs/aws.py +330 -327
  94. dstack/_internal/server/services/logs/filelog.py +7 -6
  95. dstack/_internal/server/services/logs/gcp.py +141 -139
  96. dstack/_internal/server/services/plugins.py +1 -1
  97. dstack/_internal/server/services/projects.py +2 -5
  98. dstack/_internal/server/services/proxy/repo.py +5 -1
  99. dstack/_internal/server/services/requirements/__init__.py +0 -0
  100. dstack/_internal/server/services/requirements/combine.py +259 -0
  101. dstack/_internal/server/services/runner/client.py +7 -0
  102. dstack/_internal/server/services/runs.py +1 -1
  103. dstack/_internal/server/services/services/__init__.py +8 -2
  104. dstack/_internal/server/services/services/autoscalers.py +2 -0
  105. dstack/_internal/server/services/ssh.py +2 -1
  106. dstack/_internal/server/services/storage/__init__.py +5 -6
  107. dstack/_internal/server/services/storage/gcs.py +49 -49
  108. dstack/_internal/server/services/storage/s3.py +52 -52
  109. dstack/_internal/server/statics/index.html +1 -1
  110. dstack/_internal/server/testing/common.py +1 -1
  111. dstack/_internal/server/utils/logging.py +3 -3
  112. dstack/_internal/server/utils/provisioning.py +3 -3
  113. dstack/_internal/utils/json_schema.py +3 -1
  114. dstack/_internal/utils/typing.py +14 -0
  115. dstack/api/_public/repos.py +21 -2
  116. dstack/api/_public/runs.py +5 -7
  117. dstack/api/server/__init__.py +17 -19
  118. dstack/api/server/_gpus.py +2 -1
  119. dstack/api/server/_group.py +4 -3
  120. dstack/api/server/_repos.py +20 -3
  121. dstack/plugins/builtin/rest_plugin/_plugin.py +1 -0
  122. dstack/version.py +1 -1
  123. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/METADATA +1 -1
  124. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/RECORD +127 -124
  125. dstack/api/huggingface/__init__.py +0 -73
  126. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/WHEEL +0 -0
  127. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/entry_points.txt +0 -0
  128. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/services/gpus.py:

@@ -1,8 +1,8 @@
 from typing import Dict, List, Literal, Optional, Tuple
 
-from sqlalchemy.ext.asyncio import AsyncSession
-
 from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.errors import ServerClientError
+from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.instances import InstanceOfferWithAvailability
 from dstack._internal.core.models.profiles import SpotPolicy
 from dstack._internal.core.models.resources import Range
@@ -15,10 +15,43 @@ from dstack._internal.server.schemas.gpus import (
     ListGpusResponse,
 )
 from dstack._internal.server.services.offers import get_offers_by_requirements
+from dstack._internal.utils.common import get_or_error
+
+
+async def list_gpus_grouped(
+    project: ProjectModel,
+    run_spec: RunSpec,
+    group_by: Optional[List[Literal["backend", "region", "count"]]] = None,
+) -> ListGpusResponse:
+    """Retrieves available GPU specifications based on a run spec, with optional grouping."""
+    offers = await _get_gpu_offers(project=project, run_spec=run_spec)
+    backend_gpus = _process_offers_into_backend_gpus(offers)
+    group_by_set = set(group_by) if group_by else set()
+    if "region" in group_by_set and "backend" not in group_by_set:
+        raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'")
+
+    # Determine grouping strategy based on combination
+    has_backend = "backend" in group_by_set
+    has_region = "region" in group_by_set
+    has_count = "count" in group_by_set
+    if has_backend and has_region and has_count:
+        gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus)
+    elif has_backend and has_count:
+        gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus)
+    elif has_backend and has_region:
+        gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus)
+    elif has_backend:
+        gpus = _get_gpus_grouped_by_backend(backend_gpus)
+    elif has_count:
+        gpus = _get_gpus_grouped_by_count(backend_gpus)
+    else:
+        gpus = _get_gpus_with_no_grouping(backend_gpus)
+
+    return ListGpusResponse(gpus=gpus)
 
 
 async def _get_gpu_offers(
-    session: AsyncSession, project: ProjectModel, run_spec: RunSpec
+    project: ProjectModel, run_spec: RunSpec
 ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     """Fetches all available instance offers that match the run spec's GPU requirements."""
     profile = run_spec.merged_profile
@@ -28,7 +61,6 @@ async def _get_gpu_offers(
         spot=get_policy_map(profile.spot_policy, default=SpotPolicy.AUTO),
         reservation=profile.reservation,
     )
-
     return await get_offers_by_requirements(
         project=project,
         profile=profile,
@@ -45,10 +77,10 @@ def _process_offers_into_backend_gpus(
     offers: List[Tuple[Backend, InstanceOfferWithAvailability]],
 ) -> List[BackendGpus]:
     """Transforms raw offers into a structured list of BackendGpus, aggregating GPU info."""
-    backend_data: Dict[str, Dict] = {}
+    backend_data: Dict[BackendType, Dict] = {}
 
-    for backend, offer in offers:
-        backend_type = backend.TYPE
+    for _, offer in offers:
+        backend_type = offer.backend
         if backend_type not in backend_data:
             backend_data[backend_type] = {"gpus": {}, "regions": set()}
 
@@ -111,7 +143,7 @@ def _process_offers_into_backend_gpus(
     return backend_gpus_list
 
 
-def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str):
+def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: BackendType):
     """Updates an existing GpuGroup with new data from another GPU offer."""
     spot_type: Literal["spot", "on-demand"] = "spot" if gpu.spot else "on-demand"
 
@@ -122,6 +154,12 @@ def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str):
     if row.backends and backend_type not in row.backends:
         row.backends.append(backend_type)
 
+    # FIXME: Consider using non-optional range
+    assert row.count.min is not None
+    assert row.count.max is not None
+    assert row.price.min is not None
+    assert row.price.max is not None
+
     row.count.min = min(row.count.min, gpu.count)
     row.count.max = max(row.count.max, gpu.count)
     per_gpu_price = gpu.price / gpu.count
@@ -194,7 +232,7 @@ def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
             not any(av.is_available() for av in g.availability),
             g.price.min,
             g.price.max,
-            g.backend.value,
+            get_or_error(g.backend).value,
             g.name,
             g.memory_mib,
         ),
@@ -229,7 +267,7 @@ def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
             not any(av.is_available() for av in g.availability),
             g.price.min,
             g.price.max,
-            g.backend.value,
+            get_or_error(g.backend).value,
             g.region,
             g.name,
             g.memory_mib,
@@ -299,7 +337,7 @@ def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
             not any(av.is_available() for av in g.availability),
             g.price.min,
             g.price.max,
-            g.backend.value,
+            get_or_error(g.backend).value,
             g.count.min,
             g.name,
             g.memory_mib,
@@ -344,47 +382,10 @@ def _get_gpus_grouped_by_backend_region_and_count(
             not any(av.is_available() for av in g.availability),
             g.price.min,
             g.price.max,
-            g.backend.value,
+            get_or_error(g.backend).value,
             g.region,
             g.count.min,
             g.name,
             g.memory_mib,
         ),
     )
-
-
-async def list_gpus_grouped(
-    session: AsyncSession,
-    project: ProjectModel,
-    run_spec: RunSpec,
-    group_by: Optional[List[Literal["backend", "region", "count"]]] = None,
-) -> ListGpusResponse:
-    """Retrieves available GPU specifications based on a run spec, with optional grouping."""
-    offers = await _get_gpu_offers(session, project, run_spec)
-    backend_gpus = _process_offers_into_backend_gpus(offers)
-
-    group_by_set = set(group_by) if group_by else set()
-
-    if "region" in group_by_set and "backend" not in group_by_set:
-        from dstack._internal.core.errors import ServerClientError
-
-        raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'")
-
-    # Determine grouping strategy based on combination
-    has_backend = "backend" in group_by_set
-    has_region = "region" in group_by_set
-    has_count = "count" in group_by_set
-    if has_backend and has_region and has_count:
-        gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus)
-    elif has_backend and has_count:
-        gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus)
-    elif has_backend and has_region:
-        gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus)
-    elif has_backend:
-        gpus = _get_gpus_grouped_by_backend(backend_gpus)
-    elif has_count:
-        gpus = _get_gpus_grouped_by_count(backend_gpus)
-    else:
-        gpus = _get_gpus_with_no_grouping(backend_gpus)
-
-    return ListGpusResponse(gpus=gpus)
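
Note: the relocated `list_gpus_grouped` drops the unused `AsyncSession` parameter and keeps the same dispatch over `group_by` combinations. As a reading aid, here is a minimal, self-contained sketch of just that dispatch and its validation rule (string labels stand in for the real grouping helpers and models):

    from typing import List, Literal, Optional

    GroupBy = Literal["backend", "region", "count"]

    def resolve_grouping(group_by: Optional[List[GroupBy]] = None) -> str:
        """Returns the grouping strategy a given combination selects."""
        group_by_set = set(group_by) if group_by else set()
        # "region" only makes sense within a backend, as in the diff above
        if "region" in group_by_set and "backend" not in group_by_set:
            raise ValueError("Cannot group by 'region' without also grouping by 'backend'")
        has_backend = "backend" in group_by_set
        has_region = "region" in group_by_set
        has_count = "count" in group_by_set
        if has_backend and has_region and has_count:
            return "backend_region_and_count"
        if has_backend and has_count:
            return "backend_and_count"
        if has_backend and has_region:
            return "backend_and_region"
        if has_backend:
            return "backend"
        if has_count:
            return "count"
        return "none"

    assert resolve_grouping(["backend", "region"]) == "backend_and_region"
    assert resolve_grouping() == "none"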

dstack/_internal/server/services/instances.py:

@@ -39,6 +39,7 @@ from dstack._internal.core.models.profiles import (
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.core.services.profiles import get_termination
+from dstack._internal.server import settings as server_settings
 from dstack._internal.server.models import (
     FleetModel,
     InstanceHealthCheckModel,
@@ -47,9 +48,11 @@ from dstack._internal.server.models import (
     UserModel,
 )
 from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse
-from dstack._internal.server.schemas.runner import InstanceHealthResponse
+from dstack._internal.server.schemas.runner import InstanceHealthResponse, TaskStatus
+from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import generate_shared_offer
 from dstack._internal.server.services.projects import list_user_project_models
+from dstack._internal.server.services.runner.client import ShimClient
 from dstack._internal.utils import common as common_utils
 from dstack._internal.utils.logging import get_logger
 
@@ -633,3 +636,40 @@ async def create_ssh_instance_model(
         busy_blocks=0,
     )
     return im
+
+
+def remove_dangling_tasks_from_instance(shim_client: ShimClient, instance: InstanceModel) -> None:
+    if not shim_client.is_api_v2_supported():
+        return
+    assigned_to_instance_job_ids = {str(j.id) for j in instance.jobs}
+    task_list_response = shim_client.list_tasks()
+    tasks: list[tuple[str, Optional[TaskStatus]]]
+    if task_list_response.tasks is not None:
+        tasks = [(t.id, t.status) for t in task_list_response.tasks]
+    elif task_list_response.ids is not None:
+        # compatibility with pre-0.19.26 shim
+        tasks = [(t_id, None) for t_id in task_list_response.ids]
+    else:
+        raise ValueError("Unexpected task list response, neither `tasks` nor `ids` is set")
+    for task_id, task_status in tasks:
+        if task_id in assigned_to_instance_job_ids:
+            continue
+        should_terminate = task_status != TaskStatus.TERMINATED
+        should_remove = not server_settings.SERVER_KEEP_SHIM_TASKS
+        if not (should_terminate or should_remove):
+            continue
+        logger.warning(
+            "%s: dangling task found, id=%s, status=%s. Terminating and/or removing",
+            fmt(instance),
+            task_id,
+            task_status or "<unknown>",
+        )
+        if should_terminate:
+            shim_client.terminate_task(
+                task_id=task_id,
+                reason=None,
+                message=None,
+                timeout=0,
+            )
+        if should_remove:
+            shim_client.remove_task(task_id=task_id)
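
The heart of `remove_dangling_tasks_from_instance` is a per-task pair of flags. A small sketch of just that decision, with plain values standing in for `TaskStatus.TERMINATED` and the `SERVER_KEEP_SHIM_TASKS` setting:

    from typing import Optional, Tuple

    TERMINATED = "terminated"  # stand-in for TaskStatus.TERMINATED

    def decide(task_status: Optional[str], keep_shim_tasks: bool) -> Tuple[bool, bool]:
        """Returns (should_terminate, should_remove) for a dangling task."""
        # Unknown status (a pre-0.19.26 shim reports only ids) is terminated defensively
        should_terminate = task_status != TERMINATED
        should_remove = not keep_shim_tasks
        return should_terminate, should_remove

    # Already-terminated task while tasks are kept for debugging: nothing to do
    assert decide(TERMINATED, keep_shim_tasks=True) == (False, False)
    # Running task under default settings: terminate it, then remove it
    assert decide("running", keep_shim_tasks=False) == (True, True)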

dstack/_internal/server/services/jobs/__init__.py:

@@ -256,7 +256,16 @@ async def process_terminating_job(
     if jpd is not None:
         logger.debug("%s: stopping container", fmt(job_model))
         ssh_private_keys = get_instance_ssh_private_keys(instance_model)
-        await stop_container(job_model, jpd, ssh_private_keys)
+        if not await stop_container(job_model, jpd, ssh_private_keys):
+            # The dangling container can be removed later during instance processing
+            logger.warning(
+                (
+                    "%s: could not stop container, possibly due to a communication error."
+                    " See debug logs for details."
+                    " Ignoring, can attempt to remove the container later"
+                ),
+                fmt(job_model),
+            )
     if jrd is not None and jrd.volume_names is not None:
         volume_names = jrd.volume_names
     else:
@@ -378,21 +387,22 @@ async def stop_container(
     job_model: JobModel,
     job_provisioning_data: JobProvisioningData,
     ssh_private_keys: tuple[str, Optional[str]],
-):
+) -> bool:
     if job_provisioning_data.dockerized:
         # send a request to the shim to terminate the docker container
         # SSHError and RequestException are caught in the `runner_ssh_tunner` decorator
-        await run_async(
+        return await run_async(
            _shim_submit_stop,
            ssh_private_keys,
            job_provisioning_data,
            None,
            job_model,
        )
+    return True
 
 
 @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT])
-def _shim_submit_stop(ports: Dict[int, int], job_model: JobModel):
+def _shim_submit_stop(ports: Dict[int, int], job_model: JobModel) -> bool:
     shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
 
     resp = shim_client.healthcheck()
@@ -418,6 +428,7 @@ def _shim_submit_stop(ports: Dict[int, int], job_model: JobModel):
         shim_client.remove_task(task_id=job_model.id)
     else:
         shim_client.stop(force=True)
+    return True
 
 
 def group_jobs_by_replica_latest(jobs: List[JobModel]) -> Iterable[Tuple[int, List[JobModel]]]:
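
`stop_container` now signals success as a `bool`: `_shim_submit_stop` returns `True` when it reaches the shim, and the `runner_ssh_tunnel` decorator (which catches `SSHError` and `RequestException`, per the comment above) presumably yields a falsy result when the tunnel cannot be established, which is what the new warning in `process_terminating_job` keys off. A rough sketch of that pattern with illustrative names, not the actual dstack decorator:

    import functools
    from typing import Callable

    def swallow_comm_errors(fn: Callable[..., bool]) -> Callable[..., bool]:
        """Stand-in for a tunnel decorator that maps communication errors to False."""
        @functools.wraps(fn)
        def wrapper(*args, **kwargs) -> bool:
            try:
                return fn(*args, **kwargs)
            except ConnectionError:
                return False  # e.g. the SSH tunnel could not be established
        return wrapper

    @swallow_comm_errors
    def stop(reachable: bool) -> bool:
        if not reachable:
            raise ConnectionError
        return True  # reached the shim and stopped the container

    assert stop(True) is True
    assert stop(False) is False  # caller logs a warning and moves on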

dstack/_internal/server/services/jobs/configurators/base.py:

@@ -3,7 +3,7 @@ import sys
 import threading
 from abc import ABC, abstractmethod
 from pathlib import PurePosixPath
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional
 
 from cachetools import TTLCache, cached
 
@@ -179,6 +179,7 @@ class JobConfigurator(ABC):
 
     async def _commands(self) -> List[str]:
         if self.run_spec.configuration.entrypoint is not None:  # docker-like format
+            assert self.run_spec.configuration.type != "dev-environment"
             entrypoint = shlex.split(self.run_spec.configuration.entrypoint)
             commands = self.run_spec.configuration.commands
         elif shell_commands := self._shell_commands():
@@ -258,19 +259,17 @@ class JobConfigurator(ABC):
         return self.run_spec.configuration.single_branch
 
     def _max_duration(self) -> Optional[int]:
-        if self.run_spec.merged_profile.max_duration in [None, True]:
+        if self.run_spec.merged_profile.max_duration is None:
             return self._default_max_duration()
-        if self.run_spec.merged_profile.max_duration in ["off", False]:
+        if self.run_spec.merged_profile.max_duration == "off":
             return None
-        # pydantic validator ensures this is int
         return self.run_spec.merged_profile.max_duration
 
     def _stop_duration(self) -> Optional[int]:
-        if self.run_spec.merged_profile.stop_duration in [None, True]:
+        if self.run_spec.merged_profile.stop_duration is None:
             return DEFAULT_STOP_DURATION
-        if self.run_spec.merged_profile.stop_duration in ["off", False]:
+        if self.run_spec.merged_profile.stop_duration == "off":
             return None
-        # pydantic validator ensures this is int
         return self.run_spec.merged_profile.stop_duration
 
     def _utilization_policy(self) -> Optional[UtilizationPolicy]:
@@ -328,7 +327,7 @@ class JobConfigurator(ABC):
 
 
 def interpolate_job_volumes(
-    run_volumes: List[Union[MountPoint, str]],
+    run_volumes: List[MountPoint],
     job_num: int,
 ) -> List[MountPoint]:
     if len(run_volumes) == 0:
@@ -343,9 +342,6 @@ def interpolate_job_volumes(
     )
     job_volumes = []
    for mount_point in run_volumes:
-        if isinstance(mount_point, str):
-            # pydantic validator ensures strings are converted to MountPoint
-            continue
         if not isinstance(mount_point, VolumeMountPoint):
             job_volumes.append(mount_point.copy())
             continue
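
With the boolean special cases gone, `_max_duration` and `_stop_duration` assume the merged profile value has already been validated to `None`, `"off"`, or an integer number of seconds. The resulting mapping, as a sketch (`DEFAULT` is a stand-in for the configurator-specific default):

    from typing import Literal, Optional, Union

    DEFAULT = 3600  # stand-in for the configurator-specific default, in seconds

    def effective_duration(value: Union[None, Literal["off"], int]) -> Optional[int]:
        if value is None:
            return DEFAULT  # unset: fall back to the default
        if value == "off":
            return None  # explicitly disabled: no limit
        return value  # explicit limit in seconds

    assert effective_duration(None) == 3600
    assert effective_duration("off") is None
    assert effective_duration(7200) == 7200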

dstack/_internal/server/services/jobs/configurators/dev.py:

@@ -18,6 +18,8 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
     TYPE: RunConfigurationType = RunConfigurationType.DEV_ENVIRONMENT
 
     def __init__(self, run_spec: RunSpec, secrets: Dict[str, str]):
+        assert run_spec.configuration.type == "dev-environment"
+
         if run_spec.configuration.ide == "vscode":
             __class = VSCodeDesktop
         elif run_spec.configuration.ide == "cursor":
@@ -32,6 +34,8 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
         super().__init__(run_spec=run_spec, secrets=secrets)
 
     def _shell_commands(self) -> List[str]:
+        assert self.run_spec.configuration.type == "dev-environment"
+
         commands = self.ide.get_install_commands()
         commands.append(INSTALL_IPYKERNEL)
         commands += self.run_spec.configuration.setup
@@ -56,4 +60,5 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
         return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
 
     def _ports(self) -> List[PortMapping]:
+        assert self.run_spec.configuration.type == "dev-environment"
         return self.run_spec.configuration.ports

dstack/_internal/server/services/jobs/configurators/extensions/cursor.py:

@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 
 from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
 
@@ -6,8 +6,8 @@ from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
 class CursorDesktop:
     def __init__(
         self,
-        run_name: str,
-        version: str,
+        run_name: Optional[str],
+        version: Optional[str],
         extensions: List[str],
     ):
         self.run_name = run_name

dstack/_internal/server/services/jobs/configurators/extensions/vscode.py:

@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 
 from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
 
@@ -6,8 +6,8 @@ from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
 class VSCodeDesktop:
     def __init__(
         self,
-        run_name: str,
-        version: str,
+        run_name: Optional[str],
+        version: Optional[str],
         extensions: List[str],
     ):
         self.run_name = run_name

dstack/_internal/server/services/jobs/configurators/service.py:

@@ -9,6 +9,7 @@ class ServiceJobConfigurator(JobConfigurator):
     TYPE: RunConfigurationType = RunConfigurationType.SERVICE
 
     def _shell_commands(self) -> List[str]:
+        assert self.run_spec.configuration.type == "service"
         return self.run_spec.configuration.commands
 
     def _default_single_branch(self) -> bool:

dstack/_internal/server/services/jobs/configurators/task.py:

@@ -10,6 +10,7 @@ class TaskJobConfigurator(JobConfigurator):
     TYPE: RunConfigurationType = RunConfigurationType.TASK
 
     async def get_job_specs(self, replica_num: int) -> List[JobSpec]:
+        assert self.run_spec.configuration.type == "task"
         job_specs = []
         for job_num in range(self.run_spec.configuration.nodes):
             job_spec = await self._get_job_spec(
@@ -21,6 +22,7 @@ class TaskJobConfigurator(JobConfigurator):
         return job_specs
 
     def _shell_commands(self) -> List[str]:
+        assert self.run_spec.configuration.type == "task"
         return self.run_spec.configuration.commands
 
     def _default_single_branch(self) -> bool:
@@ -33,6 +35,7 @@ class TaskJobConfigurator(JobConfigurator):
         return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
 
     def _ports(self) -> List[PortMapping]:
+        assert self.run_spec.configuration.type == "task"
         return self.run_spec.configuration.ports
 
     def _working_dir(self) -> Optional[str]:
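
The `assert self.run_spec.configuration.type == "task"` lines here, like their dev-environment and service counterparts above, are there for the type checker: comparing the `Literal`-typed discriminator narrows the configuration union, so fields that exist on only one variant resolve cleanly. A stand-alone sketch with stand-in models (narrowing as supported by pyright and mypy for tagged unions):

    from dataclasses import dataclass, field
    from typing import List, Literal, Union

    @dataclass
    class TaskConfig:
        type: Literal["task"] = "task"
        commands: List[str] = field(default_factory=list)

    @dataclass
    class ServiceConfig:
        type: Literal["service"] = "service"
        port: int = 80

    AnyConfig = Union[TaskConfig, ServiceConfig]

    def shell_commands(configuration: AnyConfig) -> List[str]:
        assert configuration.type == "task"
        # After the assert, the checker narrows `configuration` to TaskConfig,
        # so `.commands` is known to exist
        return configuration.commands

    assert shell_commands(TaskConfig(commands=["echo hi"])) == ["echo hi"]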

dstack/_internal/server/services/locking.py:

@@ -23,13 +23,13 @@ T = TypeVar("T")
 
 
 class Lockset(Protocol[T]):
-    def __contains__(self, item: T) -> bool: ...
+    def __contains__(self, item: T, /) -> bool: ...
     def __iter__(self) -> Iterator[T]: ...
     def __len__(self) -> int: ...
-    def add(self, item: T) -> None: ...
-    def discard(self, item: T) -> None: ...
-    def update(self, other: Iterable[T]) -> None: ...
-    def difference_update(self, other: Iterable[T]) -> None: ...
+    def add(self, item: T, /) -> None: ...
+    def discard(self, item: T, /) -> None: ...
+    def update(self, other: Iterable[T], /) -> None: ...
+    def difference_update(self, other: Iterable[T], /) -> None: ...
 
 
 class ResourceLocker:
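
The added `/` markers make the `Lockset` protocol's parameters positional-only. That matters because the built-in `set` methods take these arguments positionally (the parameter is not named `item`), so under a strict type checker a `set` would not satisfy a protocol whose parameters can also be passed by keyword. A condensed sketch:

    from typing import Protocol, TypeVar

    T = TypeVar("T")

    class Lockset(Protocol[T]):
        def __contains__(self, item: T, /) -> bool: ...
        def add(self, item: T, /) -> None: ...
        def discard(self, item: T, /) -> None: ...

    def lock(locks: Lockset[str], name: str) -> None:
        locks.add(name)

    locks: set = set()
    lock(locks, "resource-1")  # a plain set structurally satisfies the protocol
    assert "resource-1" in locks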

dstack/_internal/server/services/logging.py:

@@ -1,14 +1,22 @@
 from typing import Union
 
-from dstack._internal.server.models import GatewayModel, JobModel, ProbeModel, RunModel
+from dstack._internal.server.models import (
+    GatewayModel,
+    InstanceModel,
+    JobModel,
+    ProbeModel,
+    RunModel,
+)
 
 
-def fmt(model: Union[RunModel, JobModel, GatewayModel, ProbeModel]) -> str:
+def fmt(model: Union[RunModel, JobModel, InstanceModel, GatewayModel, ProbeModel]) -> str:
     """Consistent string representation of a model for logging."""
     if isinstance(model, RunModel):
         return f"run({model.id.hex[:6]}){model.run_name}"
     if isinstance(model, JobModel):
         return f"job({model.id.hex[:6]}){model.job_name}"
+    if isinstance(model, InstanceModel):
+        return f"instance({model.id.hex[:6]}){model.name}"
     if isinstance(model, GatewayModel):
         return f"gateway({model.id.hex[:6]}){model.name}"
     if isinstance(model, ProbeModel):
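
`fmt` now also accepts `InstanceModel`, which the new dangling-task warning in `instances.py` relies on. For illustration, the prefix it produces, with a stand-in dataclass in place of the SQLAlchemy model:

    import uuid
    from dataclasses import dataclass, field

    @dataclass
    class FakeInstance:  # stand-in with the same id/name shape as InstanceModel
        name: str
        id: uuid.UUID = field(default_factory=uuid.uuid4)

    def fmt(model: FakeInstance) -> str:
        return f"instance({model.id.hex[:6]}){model.name}"

    print(fmt(FakeInstance(name="my-fleet-0")))  # e.g. instance(a1b2c3)my-fleet-0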

dstack/_internal/server/services/logs/__init__.py:

@@ -7,14 +7,14 @@ from dstack._internal.server import settings
 from dstack._internal.server.models import ProjectModel
 from dstack._internal.server.schemas.logs import PollLogsRequest
 from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
-from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
+from dstack._internal.server.services.logs import aws as aws_logs
+from dstack._internal.server.services.logs import gcp as gcp_logs
 from dstack._internal.server.services.logs.base import (
     LogStorage,
     LogStorageError,
     b64encode_raw_message,
 )
 from dstack._internal.server.services.logs.filelog import FileLogStorage
-from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
 from dstack._internal.utils.common import run_async
 from dstack._internal.utils.logging import get_logger
 
@@ -29,9 +29,9 @@ def get_log_storage() -> LogStorage:
     if _log_storage is not None:
         return _log_storage
     if settings.SERVER_CLOUDWATCH_LOG_GROUP:
-        if BOTO_AVAILABLE:
+        if aws_logs.BOTO_AVAILABLE:
             try:
-                _log_storage = CloudWatchLogStorage(
+                _log_storage = aws_logs.CloudWatchLogStorage(
                     group=settings.SERVER_CLOUDWATCH_LOG_GROUP,
                     region=settings.SERVER_CLOUDWATCH_LOG_REGION,
                 )
@@ -44,9 +44,11 @@ def get_log_storage() -> LogStorage:
         else:
             logger.error("Cannot use CloudWatch Logs storage: boto3 is not installed")
     elif settings.SERVER_GCP_LOGGING_PROJECT:
-        if GCP_LOGGING_AVAILABLE:
+        if gcp_logs.GCP_LOGGING_AVAILABLE:
             try:
-                _log_storage = GCPLogStorage(project_id=settings.SERVER_GCP_LOGGING_PROJECT)
+                _log_storage = gcp_logs.GCPLogStorage(
+                    project_id=settings.SERVER_GCP_LOGGING_PROJECT
+                )
             except LogStorageError as e:
                 logger.error("Failed to initialize GCP Logs storage: %s", e)
             except Exception:
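
Importing the `aws`/`gcp` modules and reading flags through them (`aws_logs.BOTO_AVAILABLE`) means the attribute is looked up at call time instead of being frozen at import time, so tests can monkeypatch the flag or the storage class on the module. A sketch of the difference:

    import types

    # Stand-in for dstack._internal.server.services.logs.aws
    aws_logs = types.SimpleNamespace(BOTO_AVAILABLE=False)

    def get_log_storage() -> str:
        # Attribute access happens on every call, so patches are observed
        if aws_logs.BOTO_AVAILABLE:
            return "cloudwatch"
        return "file"

    assert get_log_storage() == "file"
    aws_logs.BOTO_AVAILABLE = True  # e.g. monkeypatched in a test
    assert get_log_storage() == "cloudwatch"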