dstack 0.19.25__py3-none-any.whl → 0.19.26__py3-none-any.whl

This diff compares the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

This version of dstack has been flagged as potentially problematic.
Files changed (128)
  1. dstack/_internal/cli/commands/__init__.py +2 -2
  2. dstack/_internal/cli/commands/apply.py +3 -61
  3. dstack/_internal/cli/commands/attach.py +1 -1
  4. dstack/_internal/cli/commands/completion.py +1 -1
  5. dstack/_internal/cli/commands/delete.py +2 -2
  6. dstack/_internal/cli/commands/fleet.py +1 -1
  7. dstack/_internal/cli/commands/gateway.py +2 -2
  8. dstack/_internal/cli/commands/init.py +56 -24
  9. dstack/_internal/cli/commands/logs.py +1 -1
  10. dstack/_internal/cli/commands/metrics.py +1 -1
  11. dstack/_internal/cli/commands/offer.py +45 -7
  12. dstack/_internal/cli/commands/project.py +2 -2
  13. dstack/_internal/cli/commands/secrets.py +2 -2
  14. dstack/_internal/cli/commands/server.py +1 -1
  15. dstack/_internal/cli/commands/stop.py +1 -1
  16. dstack/_internal/cli/commands/volume.py +1 -1
  17. dstack/_internal/cli/main.py +2 -2
  18. dstack/_internal/cli/services/completion.py +2 -2
  19. dstack/_internal/cli/services/configurators/__init__.py +6 -2
  20. dstack/_internal/cli/services/configurators/base.py +6 -7
  21. dstack/_internal/cli/services/configurators/fleet.py +1 -3
  22. dstack/_internal/cli/services/configurators/gateway.py +2 -4
  23. dstack/_internal/cli/services/configurators/run.py +195 -55
  24. dstack/_internal/cli/services/configurators/volume.py +2 -4
  25. dstack/_internal/cli/services/profile.py +1 -1
  26. dstack/_internal/cli/services/repos.py +51 -47
  27. dstack/_internal/core/backends/aws/configurator.py +11 -7
  28. dstack/_internal/core/backends/azure/configurator.py +11 -7
  29. dstack/_internal/core/backends/base/configurator.py +25 -13
  30. dstack/_internal/core/backends/cloudrift/configurator.py +13 -7
  31. dstack/_internal/core/backends/cudo/configurator.py +11 -7
  32. dstack/_internal/core/backends/datacrunch/compute.py +5 -1
  33. dstack/_internal/core/backends/datacrunch/configurator.py +13 -7
  34. dstack/_internal/core/backends/gcp/configurator.py +11 -7
  35. dstack/_internal/core/backends/hotaisle/configurator.py +13 -7
  36. dstack/_internal/core/backends/kubernetes/configurator.py +13 -7
  37. dstack/_internal/core/backends/lambdalabs/configurator.py +11 -7
  38. dstack/_internal/core/backends/nebius/compute.py +1 -1
  39. dstack/_internal/core/backends/nebius/configurator.py +11 -7
  40. dstack/_internal/core/backends/nebius/resources.py +21 -11
  41. dstack/_internal/core/backends/oci/configurator.py +11 -7
  42. dstack/_internal/core/backends/runpod/configurator.py +11 -7
  43. dstack/_internal/core/backends/template/configurator.py.jinja +11 -7
  44. dstack/_internal/core/backends/tensordock/configurator.py +13 -7
  45. dstack/_internal/core/backends/vastai/configurator.py +11 -7
  46. dstack/_internal/core/backends/vultr/configurator.py +11 -4
  47. dstack/_internal/core/compatibility/gpus.py +13 -0
  48. dstack/_internal/core/compatibility/runs.py +1 -0
  49. dstack/_internal/core/models/common.py +3 -3
  50. dstack/_internal/core/models/configurations.py +172 -27
  51. dstack/_internal/core/models/files.py +1 -1
  52. dstack/_internal/core/models/fleets.py +5 -1
  53. dstack/_internal/core/models/profiles.py +41 -11
  54. dstack/_internal/core/models/resources.py +46 -42
  55. dstack/_internal/core/models/runs.py +4 -0
  56. dstack/_internal/core/services/configs/__init__.py +2 -2
  57. dstack/_internal/core/services/profiles.py +2 -2
  58. dstack/_internal/core/services/repos.py +5 -3
  59. dstack/_internal/core/services/ssh/ports.py +1 -1
  60. dstack/_internal/proxy/lib/deps.py +6 -2
  61. dstack/_internal/server/app.py +22 -17
  62. dstack/_internal/server/background/tasks/process_gateways.py +4 -1
  63. dstack/_internal/server/background/tasks/process_instances.py +10 -2
  64. dstack/_internal/server/background/tasks/process_probes.py +1 -1
  65. dstack/_internal/server/background/tasks/process_running_jobs.py +10 -4
  66. dstack/_internal/server/background/tasks/process_runs.py +1 -1
  67. dstack/_internal/server/background/tasks/process_submitted_jobs.py +54 -43
  68. dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
  69. dstack/_internal/server/background/tasks/process_volumes.py +1 -1
  70. dstack/_internal/server/db.py +8 -4
  71. dstack/_internal/server/models.py +1 -0
  72. dstack/_internal/server/routers/gpus.py +1 -6
  73. dstack/_internal/server/schemas/runner.py +10 -0
  74. dstack/_internal/server/services/backends/__init__.py +14 -8
  75. dstack/_internal/server/services/backends/handlers.py +6 -1
  76. dstack/_internal/server/services/docker.py +5 -5
  77. dstack/_internal/server/services/fleets.py +14 -13
  78. dstack/_internal/server/services/gateways/__init__.py +2 -0
  79. dstack/_internal/server/services/gateways/client.py +5 -2
  80. dstack/_internal/server/services/gateways/connection.py +1 -1
  81. dstack/_internal/server/services/gpus.py +50 -49
  82. dstack/_internal/server/services/instances.py +41 -1
  83. dstack/_internal/server/services/jobs/__init__.py +15 -4
  84. dstack/_internal/server/services/jobs/configurators/base.py +7 -11
  85. dstack/_internal/server/services/jobs/configurators/dev.py +5 -0
  86. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -3
  87. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +3 -3
  88. dstack/_internal/server/services/jobs/configurators/service.py +1 -0
  89. dstack/_internal/server/services/jobs/configurators/task.py +3 -0
  90. dstack/_internal/server/services/locking.py +5 -5
  91. dstack/_internal/server/services/logging.py +10 -2
  92. dstack/_internal/server/services/logs/__init__.py +8 -6
  93. dstack/_internal/server/services/logs/aws.py +330 -327
  94. dstack/_internal/server/services/logs/filelog.py +7 -6
  95. dstack/_internal/server/services/logs/gcp.py +141 -139
  96. dstack/_internal/server/services/plugins.py +1 -1
  97. dstack/_internal/server/services/projects.py +2 -5
  98. dstack/_internal/server/services/proxy/repo.py +5 -1
  99. dstack/_internal/server/services/requirements/__init__.py +0 -0
  100. dstack/_internal/server/services/requirements/combine.py +259 -0
  101. dstack/_internal/server/services/runner/client.py +7 -0
  102. dstack/_internal/server/services/runs.py +1 -1
  103. dstack/_internal/server/services/services/__init__.py +8 -2
  104. dstack/_internal/server/services/services/autoscalers.py +2 -0
  105. dstack/_internal/server/services/ssh.py +2 -1
  106. dstack/_internal/server/services/storage/__init__.py +5 -6
  107. dstack/_internal/server/services/storage/gcs.py +49 -49
  108. dstack/_internal/server/services/storage/s3.py +52 -52
  109. dstack/_internal/server/statics/index.html +1 -1
  110. dstack/_internal/server/testing/common.py +1 -1
  111. dstack/_internal/server/utils/logging.py +3 -3
  112. dstack/_internal/server/utils/provisioning.py +3 -3
  113. dstack/_internal/utils/json_schema.py +3 -1
  114. dstack/_internal/utils/typing.py +14 -0
  115. dstack/api/_public/repos.py +21 -2
  116. dstack/api/_public/runs.py +5 -7
  117. dstack/api/server/__init__.py +17 -19
  118. dstack/api/server/_gpus.py +2 -1
  119. dstack/api/server/_group.py +4 -3
  120. dstack/api/server/_repos.py +20 -3
  121. dstack/plugins/builtin/rest_plugin/_plugin.py +1 -0
  122. dstack/version.py +1 -1
  123. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/METADATA +1 -1
  124. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/RECORD +127 -124
  125. dstack/api/huggingface/__init__.py +0 -73
  126. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/WHEEL +0 -0
  127. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/entry_points.txt +0 -0
  128. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/models/resources.py

@@ -130,6 +130,12 @@ DEFAULT_GPU_COUNT = Range[int](min=1)
 
 
 class CPUSpec(CoreModel):
+    arch: Annotated[
+        Optional[gpuhunt.CPUArchitecture],
+        Field(description="The CPU architecture, one of: `x86`, `arm`"),
+    ] = None
+    count: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
+
     class Config(CoreModel.Config):
         @staticmethod
         def schema_extra(schema: Dict[str, Any]):
@@ -138,12 +144,6 @@ class CPUSpec(CoreModel):
                 extra_types=[{"type": "integer"}, {"type": "string"}],
             )
 
-    arch: Annotated[
-        Optional[gpuhunt.CPUArchitecture],
-        Field(description="The CPU architecture, one of: `x86`, `arm`"),
-    ] = None
-    count: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
-
     @classmethod
     def __get_validators__(cls):
         yield cls.parse
@@ -191,22 +191,6 @@ class CPUSpec(CoreModel):
 
 
 class GPUSpec(CoreModel):
-    class Config(CoreModel.Config):
-        @staticmethod
-        def schema_extra(schema: Dict[str, Any]):
-            add_extra_schema_types(
-                schema["properties"]["count"],
-                extra_types=[{"type": "integer"}, {"type": "string"}],
-            )
-            add_extra_schema_types(
-                schema["properties"]["memory"],
-                extra_types=[{"type": "integer"}, {"type": "string"}],
-            )
-            add_extra_schema_types(
-                schema["properties"]["total_memory"],
-                extra_types=[{"type": "integer"}, {"type": "string"}],
-            )
-
     vendor: Annotated[
         Optional[gpuhunt.AcceleratorVendor],
         Field(
@@ -234,6 +218,26 @@ class GPUSpec(CoreModel):
         Field(description="The minimum compute capability of the GPU (e.g., `7.5`)"),
     ] = None
 
+    class Config(CoreModel.Config):
+        @staticmethod
+        def schema_extra(schema: Dict[str, Any]):
+            add_extra_schema_types(
+                schema["properties"]["count"],
+                extra_types=[{"type": "integer"}, {"type": "string"}],
+            )
+            add_extra_schema_types(
+                schema["properties"]["name"],
+                extra_types=[{"type": "string"}],
+            )
+            add_extra_schema_types(
+                schema["properties"]["memory"],
+                extra_types=[{"type": "integer"}, {"type": "string"}],
+            )
+            add_extra_schema_types(
+                schema["properties"]["total_memory"],
+                extra_types=[{"type": "integer"}, {"type": "string"}],
+            )
+
     @classmethod
     def __get_validators__(cls):
         yield cls.parse
@@ -314,6 +318,8 @@ class GPUSpec(CoreModel):
 
 
 class DiskSpec(CoreModel):
+    size: Annotated[Range[Memory], Field(description="Disk size")]
+
     class Config(CoreModel.Config):
         @staticmethod
         def schema_extra(schema: Dict[str, Any]):
@@ -322,8 +328,6 @@ class DiskSpec(CoreModel):
                 extra_types=[{"type": "integer"}, {"type": "string"}],
             )
 
-    size: Annotated[Range[Memory], Field(description="Disk size")]
-
     @classmethod
     def __get_validators__(cls):
         yield cls._parse
@@ -340,6 +344,24 @@ DEFAULT_DISK = DiskSpec(size=Range[Memory](min=Memory.parse("100GB"), max=None))
 
 
 class ResourcesSpec(CoreModel):
+    # TODO: Remove Range[int] in 0.20. Range[int] for backward compatibility only.
+    cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = (
+        CPUSpec()
+    )
+    memory: Annotated[Range[Memory], Field(description="The RAM size (e.g., `8GB`)")] = (
+        DEFAULT_MEMORY_SIZE
+    )
+    shm_size: Annotated[
+        Optional[Memory],
+        Field(
+            description="The size of shared memory (e.g., `8GB`). "
+            "If you are using parallel communicating processes (e.g., dataloaders in PyTorch), "
+            "you may need to configure this"
+        ),
+    ] = None
+    gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
+    disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK
+
     class Config(CoreModel.Config):
         @staticmethod
         def schema_extra(schema: Dict[str, Any]):
@@ -364,24 +386,6 @@ class ResourcesSpec(CoreModel):
                 extra_types=[{"type": "integer"}, {"type": "string"}],
             )
 
-    # TODO: Remove Range[int] in 0.20. Range[int] for backward compatibility only.
-    cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = (
-        CPUSpec()
-    )
-    memory: Annotated[Range[Memory], Field(description="The RAM size (e.g., `8GB`)")] = (
-        DEFAULT_MEMORY_SIZE
-    )
-    shm_size: Annotated[
-        Optional[Memory],
-        Field(
-            description="The size of shared memory (e.g., `8GB`). "
-            "If you are using parallel communicating processes (e.g., dataloaders in PyTorch), "
-            "you may need to configure this"
-        ),
-    ] = None
-    gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
-    disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK
-
     def pretty_format(self) -> str:
         # TODO: Remove in 0.20. Use self.cpu directly
        cpu = parse_obj_as(CPUSpec, self.cpu)
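These resources.py hunks are mostly reorderings: field declarations now precede the nested Config, and GPUSpec additionally advertises a string form for name in its JSON schema. In pydantic v1, field collection does not depend on where the nested Config sits, so the moves are cosmetic. A minimal sketch of the pattern, using plain pydantic v1 rather than dstack's CoreModel (CPUSpecSketch and the "2..8" range spelling are illustrative, not dstack's API):

```python
# Sketch: fields declared before the nested Config, with schema_extra widening
# the published JSON schema so both 2 and "2..8" validate downstream.
from typing import Any, Dict, Optional, Union

from pydantic import BaseModel, Field  # pydantic v1 API


class CPUSpecSketch(BaseModel):
    arch: Optional[str] = Field(None, description="The CPU architecture")
    count: Union[int, str] = Field(1, description="The number of CPU cores")

    class Config:
        @staticmethod
        def schema_extra(schema: Dict[str, Any], model: type) -> None:
            # Advertise both spellings of `count` in the published schema.
            schema["properties"]["count"]["anyOf"] = [
                {"type": "integer"},
                {"type": "string"},
            ]


print(CPUSpecSketch.schema()["properties"]["count"])
```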
dstack/_internal/core/models/runs.py

@@ -1,6 +1,7 @@
 from datetime import datetime, timedelta
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Type
+from urllib.parse import urlparse
 
 from pydantic import UUID4, Field, root_validator
 from typing_extensions import Annotated
@@ -483,6 +484,9 @@ class ServiceSpec(CoreModel):
     model: Optional[ServiceModelSpec] = None
     options: Dict[str, Any] = {}
 
+    def get_domain(self) -> Optional[str]:
+        return urlparse(self.url).hostname
+
 
 class RunStatus(str, Enum):
     PENDING = "pending"
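urlparse().hostname is a good fit for get_domain(): it lowercases the host and strips the port, credentials, and path, returning None when the URL has no network location. A quick check:

```python
# Behavior of the stdlib call backing ServiceSpec.get_domain()
from urllib.parse import urlparse

assert urlparse("https://user@Service.Example.com:8443/v1").hostname == "service.example.com"
assert urlparse("not-a-url").hostname is None
```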
dstack/_internal/core/services/configs/__init__.py

@@ -68,8 +68,8 @@ class ConfigManager:
         if len(self.config.projects) == 1:
             self.config.projects[0].default = True
 
-    def list_projects(self):
-        return [project.name for project in self.config.projects]
+    def list_project_configs(self) -> list[ProjectConfig]:
+        return self.config.projects
 
     def delete_project(self, name: str):
         self.config.projects = [p for p in self.config.projects if p.name != name]
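The rename trades bare names for full ProjectConfig objects, so callers can read any config field; names are one attribute away. A hypothetical caller (assuming ConfigManager's default construction and that ProjectConfig exposes name and default, as the surrounding hunks suggest):

```python
# Hypothetical usage of the renamed API; not taken from dstack's sources.
from dstack._internal.core.services.configs import ConfigManager

manager = ConfigManager()
names = [p.name for p in manager.list_project_configs()]
default = next((p for p in manager.list_project_configs() if p.default), None)
```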
dstack/_internal/core/services/profiles.py

@@ -37,10 +37,10 @@ def get_termination(
 ) -> Tuple[TerminationPolicy, int]:
     termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
     termination_idle_time = default_termination_idle_time
-    if profile.idle_duration is not None and int(profile.idle_duration) < 0:
+    if profile.idle_duration is not None and profile.idle_duration < 0:
         termination_policy = TerminationPolicy.DONT_DESTROY
     elif profile.idle_duration is not None:
         termination_idle_time = profile.idle_duration
     if termination_policy == TerminationPolicy.DONT_DESTROY:
         termination_idle_time = -1
-    return termination_policy, int(termination_idle_time)
+    return termination_policy, termination_idle_time
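The dropped int() casts make sense if the profile model already normalizes idle_duration to integer seconds at validation time, so call sites can compare and return it directly. A hedged sketch of that normalization; parse_idle_duration is a stand-in, not dstack's actual helper:

```python
# Sketch, assuming durations are normalized once at parse time ("2h" -> 7200,
# "off" -> -1), which is what makes the runtime int() casts redundant.
from typing import Optional, Union


def parse_idle_duration(v: Union[int, str, None]) -> Optional[int]:
    if v is None or isinstance(v, int):
        return v
    if v == "off":
        return -1
    units = {"s": 1, "m": 60, "h": 3600, "d": 86400}
    return int(v[:-1]) * units[v[-1]]


assert parse_idle_duration("2h") == 7200
assert parse_idle_duration("off") == -1
```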
dstack/_internal/core/services/repos.py

@@ -2,7 +2,7 @@ import os
 from pathlib import Path
 from typing import Optional, Union
 
-import git
+import git.cmd
 import requests
 import yaml
 from git.exc import GitCommandError
@@ -24,6 +24,8 @@ logger = get_logger(__name__)
 gh_config_path = os.path.expanduser("~/.config/gh/hosts.yml")
 default_ssh_key = os.path.expanduser("~/.ssh/id_rsa")
 
+no_prompt_env = dict(GIT_TERMINAL_PROMPT="0")
+
 
 class InvalidRepoCredentialsError(DstackError):
     pass
@@ -84,7 +86,7 @@ def get_local_repo_credentials(
 
 def check_remote_repo_credentials_https(url: GitRepoURL, oauth_token: str) -> RemoteRepoCreds:
     try:
-        git.cmd.Git().ls_remote(url.as_https(oauth_token), env=dict(GIT_TERMINAL_PROMPT="0"))
+        git.cmd.Git().ls_remote(url.as_https(oauth_token), env=no_prompt_env)
     except GitCommandError:
         masked = len(oauth_token[:-4]) * "*" + oauth_token[-4:]
         raise InvalidRepoCredentialsError(
@@ -131,7 +133,7 @@ def get_default_branch(remote_url: str) -> Optional[str]:
     Get the default branch of a remote Git repository.
     """
     try:
-        output = git.cmd.Git().ls_remote("--symref", remote_url, "HEAD")
+        output = git.cmd.Git().ls_remote("--symref", remote_url, "HEAD", env=no_prompt_env)
         for line in output.splitlines():
             if line.startswith("ref:"):
                 return line.split()[1].split("/")[-1]
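Both call sites now share one no-prompt environment: with GIT_TERMINAL_PROMPT=0, git fails fast on missing credentials instead of hanging on an interactive prompt. A sketch of the probe pattern (remote_is_reachable is illustrative):

```python
# Sketch of a non-interactive remote probe using GitPython, which accepts an
# `env` keyword on dynamically dispatched commands like ls_remote.
import git.cmd
from git.exc import GitCommandError

no_prompt_env = dict(GIT_TERMINAL_PROMPT="0")


def remote_is_reachable(remote_url: str) -> bool:
    try:
        git.cmd.Git().ls_remote(remote_url, env=no_prompt_env)
        return True
    except GitCommandError:
        return False
```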
dstack/_internal/core/services/ssh/ports.py

@@ -74,7 +74,7 @@ class PortsLock:
         try:
             sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
             if IS_WINDOWS:
-                sock.setsockopt(socket.SOL_SOCKET, socket.SO_EXCLUSIVEADDRUSE, 1)
+                sock.setsockopt(socket.SOL_SOCKET, socket.SO_EXCLUSIVEADDRUSE, 1)  # type: ignore[attr-defined]
             sock.bind(("", port))
             return sock
         except socket.error as e:
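The added type: ignore exists because SO_EXCLUSIVEADDRUSE is only present in the socket module on Windows; type checkers running elsewhere flag the attribute even though the IS_WINDOWS guard makes it safe at runtime. A self-contained sketch of the same probe:

```python
# Sketch of the port-reservation probe: SO_EXCLUSIVEADDRUSE makes the bind
# fail if any other process already holds the port (unlike SO_REUSEADDR).
import socket
import sys
from typing import Optional


def try_reserve_port(port: int) -> Optional[socket.socket]:
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    if sys.platform == "win32":
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_EXCLUSIVEADDRUSE, 1)
    try:
        sock.bind(("", port))
        return sock
    except OSError:
        sock.close()
        return None
```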
dstack/_internal/proxy/lib/deps.py

@@ -21,12 +21,16 @@ class ProxyDependencyInjector(ABC):
     def __init__(self) -> None:
         self._service_conn_pool = ServiceConnectionPool()
 
+    # Abstract AsyncGenerator does not need async def since
+    # type checkers infer a different type without yield in body.
+    # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
+
     @abstractmethod
-    async def get_repo(self) -> AsyncGenerator[BaseProxyRepo, None]:
+    def get_repo(self) -> AsyncGenerator[BaseProxyRepo, None]:
         pass
 
     @abstractmethod
-    async def get_auth_provider(self) -> AsyncGenerator[BaseProxyAuthProvider, None]:
+    def get_auth_provider(self) -> AsyncGenerator[BaseProxyAuthProvider, None]:
         pass
 
     async def get_service_connection_pool(self) -> ServiceConnectionPool:
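A runnable illustration of the comment in that hunk: a concrete implementation containing yield is an async generator function, so calling it already returns an AsyncGenerator; an async def stub without yield would instead be typed as returning a coroutine that wraps the generator. The class names here are illustrative:

```python
# Why the abstract stubs are plain `def`: only the yielding implementation
# needs (and gets) async-generator semantics.
from abc import ABC, abstractmethod
from typing import AsyncGenerator


class InjectorSketch(ABC):
    @abstractmethod
    def get_repo(self) -> AsyncGenerator[str, None]: ...


class ConcreteInjector(InjectorSketch):
    async def get_repo(self) -> AsyncGenerator[str, None]:
        yield "repo"  # `yield` makes this an async generator function
```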
dstack/_internal/server/app.py

@@ -110,9 +110,11 @@ async def lifespan(app: FastAPI):
     _print_dstack_logo()
     if not check_required_ssh_version():
         logger.warning("OpenSSH 8.4+ is required. The dstack server may not work properly")
+    server_config_manager = None
+    server_config_loaded = False
     if settings.SERVER_CONFIG_ENABLED:
         server_config_manager = ServerConfigManager()
-        config_loaded = server_config_manager.load_config()
+        server_config_loaded = server_config_manager.load_config()
         # Encryption has to be configured before working with users and projects
         await server_config_manager.apply_encryption()
     async with get_session_ctx() as session:
@@ -126,11 +128,9 @@ async def lifespan(app: FastAPI):
                 session=session,
                 user=admin,
             )
-        if settings.SERVER_CONFIG_ENABLED:
-            server_config_dir = str(SERVER_CONFIG_FILE_PATH).replace(
-                os.path.expanduser("~"), "~", 1
-            )
-            if not config_loaded:
+        if server_config_manager is not None:
+            server_config_dir = _get_server_config_dir()
+            if not server_config_loaded:
                 logger.info("Initializing the default configuration...", {"show_path": False})
                 await server_config_manager.init_config(session=session)
                 logger.info(
@@ -153,6 +153,7 @@ async def lifespan(app: FastAPI):
         )
     if settings.SERVER_S3_BUCKET is not None or settings.SERVER_GCS_BUCKET is not None:
         init_default_storage()
+    scheduler = None
    if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
         scheduler = start_background_tasks()
     else:
@@ -167,7 +168,7 @@ async def lifespan(app: FastAPI):
     for func in _ON_STARTUP_HOOKS:
         await func(app)
     yield
-    if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
+    if scheduler is not None:
         scheduler.shutdown()
     PROBES_SCHEDULER.shutdown(wait=False)
     await gateway_connections_pool.remove_all()
@@ -371,6 +372,18 @@ def _is_prometheus_request(request: Request) -> bool:
     return request.url.path.startswith("/metrics")
 
 
+def _sentry_traces_sampler(sampling_context: SamplingContext) -> float:
+    parent_sampling_decision = sampling_context["parent_sampled"]
+    if parent_sampling_decision is not None:
+        return float(parent_sampling_decision)
+    transaction_context = sampling_context["transaction_context"]
+    name = transaction_context.get("name")
+    if name is not None:
+        if name.startswith("background."):
+            return settings.SENTRY_TRACES_BACKGROUND_SAMPLE_RATE
+    return settings.SENTRY_TRACES_SAMPLE_RATE
+
+
 def _print_dstack_logo():
     console.print(
         """[purple]╱╱╭╮╱╱╭╮╱╱╱╱╱╱╭╮
@@ -387,13 +400,5 @@ def _print_dstack_logo():
     )
 
 
-def _sentry_traces_sampler(sampling_context: SamplingContext) -> float:
-    parent_sampling_decision = sampling_context["parent_sampled"]
-    if parent_sampling_decision is not None:
-        return float(parent_sampling_decision)
-    transaction_context = sampling_context["transaction_context"]
-    name = transaction_context.get("name")
-    if name is not None:
-        if name.startswith("background."):
-            return settings.SENTRY_TRACES_BACKGROUND_SAMPLE_RATE
-    return settings.SENTRY_TRACES_SAMPLE_RATE
+def _get_server_config_dir() -> str:
+    return str(SERVER_CONFIG_FILE_PATH).replace(os.path.expanduser("~"), "~", 1)
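For context, a sampler like _sentry_traces_sampler plugs into the Sentry SDK via sentry_sdk.init, which accepts traces_sampler as an alternative to a flat traces_sample_rate. A hedged sketch; the DSN and rates below are placeholders, not dstack's settings:

```python
# Sketch: follow the parent's sampling decision when present, otherwise
# sample background transactions at a lower rate than request transactions.
import sentry_sdk


def traces_sampler(sampling_context: dict) -> float:
    parent_sampled = sampling_context.get("parent_sampled")
    if parent_sampled is not None:
        return float(parent_sampled)
    name = (sampling_context.get("transaction_context") or {}).get("name") or ""
    return 0.01 if name.startswith("background.") else 0.1


sentry_sdk.init(dsn="https://key@example.ingest.sentry.io/0", traces_sampler=traces_sampler)
```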
dstack/_internal/server/background/tasks/process_gateways.py

@@ -49,8 +49,8 @@ async def process_gateways():
         if gateway_model is None:
             return
         lockset.add(gateway_model.id)
+        gateway_model_id = gateway_model.id
         try:
-            gateway_model_id = gateway_model.id
             initial_status = gateway_model.status
             if initial_status == GatewayStatus.SUBMITTED:
                 await _process_submitted_gateway(session=session, gateway_model=gateway_model)
@@ -165,6 +165,9 @@ async def _process_provisioning_gateway(
     )
     gateway_model = res.unique().scalar_one()
 
+    # Provisioning gateways must have compute.
+    assert gateway_model.gateway_compute is not None
+
     # FIXME: problems caused by blocking on connect_to_gateway_with_retry and configure_gateway:
     # - cannot delete the gateway before it is provisioned because the DB model is locked
     # - connection retry counter is reset on server restart
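The asserts added in this release (here and in process_instances.py below) are the standard way to narrow an Optional for type checkers at a point where an invariant guarantees the value; they also fail loudly if the invariant is ever violated. A minimal illustration with an illustrative function name:

```python
# Sketch of assert-based narrowing: after the assert, the type checker treats
# the value as non-Optional within this scope.
from typing import Optional


def domain_of(gateway_compute: Optional[str]) -> str:
    # Invariant (mirroring the hunk above): provisioning gateways have compute.
    assert gateway_compute is not None
    return gateway_compute.lower()
```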
dstack/_internal/server/background/tasks/process_instances.py

@@ -85,8 +85,10 @@ from dstack._internal.server.services.instances import (
     get_instance_provisioning_data,
     get_instance_requirements,
     get_instance_ssh_private_keys,
+    remove_dangling_tasks_from_instance,
 )
 from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import is_divisible_into_blocks
 from dstack._internal.server.services.placement import (
     get_fleet_placement_group_models,
@@ -181,8 +183,8 @@ async def _process_next_instance():
         if instance is None:
             return
         lockset.add(instance.id)
+        instance_model_id = instance.id
         try:
-            instance_model_id = instance.id
             await _process_instance(session=session, instance=instance)
         finally:
             lockset.difference_update([instance_model_id])
@@ -393,6 +395,7 @@ async def _add_remote(instance: InstanceModel) -> None:
         return
 
     region = instance.region
+    assert region is not None  # always set for ssh instances
     jpd = JobProvisioningData(
         backend=BackendType.REMOTE,
         instance_type=instance_type,
@@ -788,6 +791,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non
         ssh_private_keys,
         job_provisioning_data,
         None,
+        instance=instance,
         check_instance_health=check_instance_health,
     )
     if instance_check is False:
@@ -934,7 +938,7 @@ async def _wait_for_instance_provisioning_data(
 
 @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
 def _check_instance_inner(
-    ports: Dict[int, int], *, check_instance_health: bool = False
+    ports: Dict[int, int], *, instance: InstanceModel, check_instance_health: bool = False
 ) -> InstanceCheck:
     instance_health_response: Optional[InstanceHealthResponse] = None
     shim_client = runner_client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
@@ -954,6 +958,10 @@ def _check_instance_inner(
             args = (method.__func__.__name__, e.__class__.__name__, e)
             logger.exception(template, *args)
             return InstanceCheck(reachable=False, message=template % args)
+    try:
+        remove_dangling_tasks_from_instance(shim_client, instance)
+    except Exception as e:
+        logger.exception("%s: error removing dangling tasks: %s", fmt(instance), e)
     return runner_client.healthcheck_response_to_instance_check(
         healthcheck_response, instance_health_response
     )
dstack/_internal/server/background/tasks/process_probes.py

@@ -120,7 +120,7 @@ async def _execute_probe(probe: ProbeModel, probe_spec: ProbeSpec) -> bool:
         method=probe_spec.method,
         url="http://dstack" + probe_spec.url,
         headers=[(h.name, h.value) for h in probe_spec.headers],
-        data=probe_spec.body,
+        content=probe_spec.body,
         timeout=probe_spec.timeout,
         follow_redirects=False,
     )
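The data= to content= switch matches httpx's API: content= carries a raw request body (bytes or str), while data= is meant for form-encoded fields, so a probe's raw body belongs in content=. A hedged sketch of an equivalent call (send_probe is illustrative):

```python
# Sending a raw probe body with httpx: content=, not data=.
from typing import Optional

import httpx


async def send_probe(url: str, body: Optional[bytes]) -> bool:
    async with httpx.AsyncClient() as client:
        resp = await client.request("GET", url, content=body, follow_redirects=False)
        return resp.is_success
```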
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -128,9 +128,8 @@ async def _process_next_running_job():
         if job_model is None:
             return
         lockset.add(job_model.id)
-
+        job_model_id = job_model.id
         try:
-            job_model_id = job_model.id
             await _process_running_job(session=session, job_model=job_model)
         finally:
             lockset.difference_update([job_model_id])
@@ -170,6 +169,11 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
 
+    volumes = []
+    secrets = {}
+    cluster_info = None
+    repo_creds = None
+
     initial_status = job_model.status
     if initial_status in [JobStatus.PROVISIONING, JobStatus.PULLING]:
         # Wait until all other jobs in the replica are provisioned
@@ -257,6 +261,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 user_ssh_key,
             )
         else:
+            assert cluster_info is not None
             logger.debug(
                 "%s: process provisioning job without shim, age=%s",
                 fmt(job_model),
@@ -275,7 +280,6 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 repo=repo_model,
                 code_hash=_get_repo_code_hash(run, job),
             )
-
         success = await common_utils.run_async(
             _submit_job_to_runner,
             server_ssh_private_keys,
@@ -309,6 +313,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
     else:  # fails are not acceptable
         if initial_status == JobStatus.PULLING:
+            assert cluster_info is not None
             logger.debug(
                 "%s: process pulling job with shim, age=%s", fmt(job_model), job_submission.age
             )
@@ -341,7 +346,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 server_ssh_private_keys,
                 job_provisioning_data,
             )
-        elif initial_status == JobStatus.RUNNING:
+        else:
             logger.debug("%s: process running job, age=%s", fmt(job_model), job_submission.age)
             success = await common_utils.run_async(
                 _process_running,
@@ -632,6 +637,7 @@ def _process_pulling_with_shim(
     is successful
     """
     shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
+    job_runtime_data = None
     if shim_client.is_api_v2_supported():  # raises error if shim is down, causes retry
         task = shim_client.get_task(job_model.id)
 
dstack/_internal/server/background/tasks/process_runs.py

@@ -129,8 +129,8 @@ async def _process_next_run():
         job_ids = [j.id for j in run_model.jobs]
         run_lockset.add(run_model.id)
         job_lockset.update(job_ids)
+        run_model_id = run_model.id
         try:
-            run_model_id = run_model.id
             await _process_run(session=session, run_model=run_model)
         finally:
             run_lockset.difference_update([run_model_id])
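The same one-line fix recurs across these background tasks: bind the id before the try block so the finally clause can never reference an unbound name, even if an exception (e.g., a KeyboardInterrupt) lands before the first statement inside try executes. Schematic version; handle() is a stand-in for the real task body:

```python
# Sketch of the lockset pattern after the fix.
from typing import Any, Set

lockset: Set[Any] = set()


def handle(model: Any) -> None:
    ...  # stand-in for the real processing logic


def guarded_process(model: Any) -> None:
    lockset.add(model.id)
    model_id = model.id  # bound *before* try: always defined in `finally`
    try:
        handle(model)
    finally:
        lockset.difference_update([model_id])
```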