dstack 0.19.25__py3-none-any.whl → 0.19.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack might be problematic.
- dstack/_internal/cli/commands/__init__.py +2 -2
- dstack/_internal/cli/commands/apply.py +3 -61
- dstack/_internal/cli/commands/attach.py +1 -1
- dstack/_internal/cli/commands/completion.py +1 -1
- dstack/_internal/cli/commands/delete.py +2 -2
- dstack/_internal/cli/commands/fleet.py +1 -1
- dstack/_internal/cli/commands/gateway.py +2 -2
- dstack/_internal/cli/commands/init.py +56 -24
- dstack/_internal/cli/commands/logs.py +1 -1
- dstack/_internal/cli/commands/metrics.py +1 -1
- dstack/_internal/cli/commands/offer.py +45 -7
- dstack/_internal/cli/commands/project.py +2 -2
- dstack/_internal/cli/commands/secrets.py +2 -2
- dstack/_internal/cli/commands/server.py +1 -1
- dstack/_internal/cli/commands/stop.py +1 -1
- dstack/_internal/cli/commands/volume.py +1 -1
- dstack/_internal/cli/main.py +2 -2
- dstack/_internal/cli/services/completion.py +2 -2
- dstack/_internal/cli/services/configurators/__init__.py +6 -2
- dstack/_internal/cli/services/configurators/base.py +6 -7
- dstack/_internal/cli/services/configurators/fleet.py +1 -3
- dstack/_internal/cli/services/configurators/gateway.py +2 -4
- dstack/_internal/cli/services/configurators/run.py +195 -55
- dstack/_internal/cli/services/configurators/volume.py +2 -4
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/cli/services/repos.py +51 -47
- dstack/_internal/core/backends/aws/configurator.py +11 -7
- dstack/_internal/core/backends/azure/configurator.py +11 -7
- dstack/_internal/core/backends/base/configurator.py +25 -13
- dstack/_internal/core/backends/cloudrift/configurator.py +13 -7
- dstack/_internal/core/backends/cudo/configurator.py +11 -7
- dstack/_internal/core/backends/datacrunch/compute.py +5 -1
- dstack/_internal/core/backends/datacrunch/configurator.py +13 -7
- dstack/_internal/core/backends/gcp/configurator.py +11 -7
- dstack/_internal/core/backends/hotaisle/configurator.py +13 -7
- dstack/_internal/core/backends/kubernetes/configurator.py +13 -7
- dstack/_internal/core/backends/lambdalabs/configurator.py +11 -7
- dstack/_internal/core/backends/nebius/compute.py +1 -1
- dstack/_internal/core/backends/nebius/configurator.py +11 -7
- dstack/_internal/core/backends/nebius/resources.py +21 -11
- dstack/_internal/core/backends/oci/configurator.py +11 -7
- dstack/_internal/core/backends/runpod/configurator.py +11 -7
- dstack/_internal/core/backends/template/configurator.py.jinja +11 -7
- dstack/_internal/core/backends/tensordock/configurator.py +13 -7
- dstack/_internal/core/backends/vastai/configurator.py +11 -7
- dstack/_internal/core/backends/vultr/configurator.py +11 -4
- dstack/_internal/core/compatibility/gpus.py +13 -0
- dstack/_internal/core/compatibility/runs.py +1 -0
- dstack/_internal/core/models/common.py +3 -3
- dstack/_internal/core/models/configurations.py +172 -27
- dstack/_internal/core/models/files.py +1 -1
- dstack/_internal/core/models/fleets.py +5 -1
- dstack/_internal/core/models/profiles.py +41 -11
- dstack/_internal/core/models/resources.py +46 -42
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/core/services/configs/__init__.py +2 -2
- dstack/_internal/core/services/profiles.py +2 -2
- dstack/_internal/core/services/repos.py +5 -3
- dstack/_internal/core/services/ssh/ports.py +1 -1
- dstack/_internal/proxy/lib/deps.py +6 -2
- dstack/_internal/server/app.py +22 -17
- dstack/_internal/server/background/tasks/process_gateways.py +4 -1
- dstack/_internal/server/background/tasks/process_instances.py +10 -2
- dstack/_internal/server/background/tasks/process_probes.py +1 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +10 -4
- dstack/_internal/server/background/tasks/process_runs.py +1 -1
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +54 -43
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
- dstack/_internal/server/background/tasks/process_volumes.py +1 -1
- dstack/_internal/server/db.py +8 -4
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/gpus.py +1 -6
- dstack/_internal/server/schemas/runner.py +10 -0
- dstack/_internal/server/services/backends/__init__.py +14 -8
- dstack/_internal/server/services/backends/handlers.py +6 -1
- dstack/_internal/server/services/docker.py +5 -5
- dstack/_internal/server/services/fleets.py +14 -13
- dstack/_internal/server/services/gateways/__init__.py +2 -0
- dstack/_internal/server/services/gateways/client.py +5 -2
- dstack/_internal/server/services/gateways/connection.py +1 -1
- dstack/_internal/server/services/gpus.py +50 -49
- dstack/_internal/server/services/instances.py +41 -1
- dstack/_internal/server/services/jobs/__init__.py +15 -4
- dstack/_internal/server/services/jobs/configurators/base.py +7 -11
- dstack/_internal/server/services/jobs/configurators/dev.py +5 -0
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -3
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +3 -3
- dstack/_internal/server/services/jobs/configurators/service.py +1 -0
- dstack/_internal/server/services/jobs/configurators/task.py +3 -0
- dstack/_internal/server/services/locking.py +5 -5
- dstack/_internal/server/services/logging.py +10 -2
- dstack/_internal/server/services/logs/__init__.py +8 -6
- dstack/_internal/server/services/logs/aws.py +330 -327
- dstack/_internal/server/services/logs/filelog.py +7 -6
- dstack/_internal/server/services/logs/gcp.py +141 -139
- dstack/_internal/server/services/plugins.py +1 -1
- dstack/_internal/server/services/projects.py +2 -5
- dstack/_internal/server/services/proxy/repo.py +5 -1
- dstack/_internal/server/services/requirements/__init__.py +0 -0
- dstack/_internal/server/services/requirements/combine.py +259 -0
- dstack/_internal/server/services/runner/client.py +7 -0
- dstack/_internal/server/services/runs.py +1 -1
- dstack/_internal/server/services/services/__init__.py +8 -2
- dstack/_internal/server/services/services/autoscalers.py +2 -0
- dstack/_internal/server/services/ssh.py +2 -1
- dstack/_internal/server/services/storage/__init__.py +5 -6
- dstack/_internal/server/services/storage/gcs.py +49 -49
- dstack/_internal/server/services/storage/s3.py +52 -52
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/testing/common.py +1 -1
- dstack/_internal/server/utils/logging.py +3 -3
- dstack/_internal/server/utils/provisioning.py +3 -3
- dstack/_internal/utils/json_schema.py +3 -1
- dstack/_internal/utils/typing.py +14 -0
- dstack/api/_public/repos.py +21 -2
- dstack/api/_public/runs.py +5 -7
- dstack/api/server/__init__.py +17 -19
- dstack/api/server/_gpus.py +2 -1
- dstack/api/server/_group.py +4 -3
- dstack/api/server/_repos.py +20 -3
- dstack/plugins/builtin/rest_plugin/_plugin.py +1 -0
- dstack/version.py +1 -1
- {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/METADATA +1 -1
- {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/RECORD +127 -124
- dstack/api/huggingface/__init__.py +0 -73
- {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/WHEEL +0 -0
- {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/models/resources.py
CHANGED

@@ -130,6 +130,12 @@ DEFAULT_GPU_COUNT = Range[int](min=1)
 
 
 class CPUSpec(CoreModel):
+    arch: Annotated[
+        Optional[gpuhunt.CPUArchitecture],
+        Field(description="The CPU architecture, one of: `x86`, `arm`"),
+    ] = None
+    count: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
+
     class Config(CoreModel.Config):
         @staticmethod
         def schema_extra(schema: Dict[str, Any]):
@@ -138,12 +144,6 @@ class CPUSpec(CoreModel):
                 extra_types=[{"type": "integer"}, {"type": "string"}],
             )
 
-    arch: Annotated[
-        Optional[gpuhunt.CPUArchitecture],
-        Field(description="The CPU architecture, one of: `x86`, `arm`"),
-    ] = None
-    count: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
-
     @classmethod
     def __get_validators__(cls):
        yield cls.parse
@@ -191,22 +191,6 @@ class CPUSpec(CoreModel):
 
 
 class GPUSpec(CoreModel):
-    class Config(CoreModel.Config):
-        @staticmethod
-        def schema_extra(schema: Dict[str, Any]):
-            add_extra_schema_types(
-                schema["properties"]["count"],
-                extra_types=[{"type": "integer"}, {"type": "string"}],
-            )
-            add_extra_schema_types(
-                schema["properties"]["memory"],
-                extra_types=[{"type": "integer"}, {"type": "string"}],
-            )
-            add_extra_schema_types(
-                schema["properties"]["total_memory"],
-                extra_types=[{"type": "integer"}, {"type": "string"}],
-            )
-
     vendor: Annotated[
         Optional[gpuhunt.AcceleratorVendor],
         Field(
@@ -234,6 +218,26 @@ class GPUSpec(CoreModel):
         Field(description="The minimum compute capability of the GPU (e.g., `7.5`)"),
     ] = None
 
+    class Config(CoreModel.Config):
+        @staticmethod
+        def schema_extra(schema: Dict[str, Any]):
+            add_extra_schema_types(
+                schema["properties"]["count"],
+                extra_types=[{"type": "integer"}, {"type": "string"}],
+            )
+            add_extra_schema_types(
+                schema["properties"]["name"],
+                extra_types=[{"type": "string"}],
+            )
+            add_extra_schema_types(
+                schema["properties"]["memory"],
+                extra_types=[{"type": "integer"}, {"type": "string"}],
+            )
+            add_extra_schema_types(
+                schema["properties"]["total_memory"],
+                extra_types=[{"type": "integer"}, {"type": "string"}],
+            )
+
     @classmethod
     def __get_validators__(cls):
         yield cls.parse
@@ -314,6 +318,8 @@ class GPUSpec(CoreModel):
 
 
 class DiskSpec(CoreModel):
+    size: Annotated[Range[Memory], Field(description="Disk size")]
+
     class Config(CoreModel.Config):
         @staticmethod
         def schema_extra(schema: Dict[str, Any]):
@@ -322,8 +328,6 @@ class DiskSpec(CoreModel):
                 extra_types=[{"type": "integer"}, {"type": "string"}],
             )
 
-    size: Annotated[Range[Memory], Field(description="Disk size")]
-
     @classmethod
     def __get_validators__(cls):
         yield cls._parse
@@ -340,6 +344,24 @@ DEFAULT_DISK = DiskSpec(size=Range[Memory](min=Memory.parse("100GB"), max=None))
 
 
 class ResourcesSpec(CoreModel):
+    # TODO: Remove Range[int] in 0.20. Range[int] for backward compatibility only.
+    cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = (
+        CPUSpec()
+    )
+    memory: Annotated[Range[Memory], Field(description="The RAM size (e.g., `8GB`)")] = (
+        DEFAULT_MEMORY_SIZE
+    )
+    shm_size: Annotated[
+        Optional[Memory],
+        Field(
+            description="The size of shared memory (e.g., `8GB`). "
+            "If you are using parallel communicating processes (e.g., dataloaders in PyTorch), "
+            "you may need to configure this"
+        ),
+    ] = None
+    gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
+    disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK
+
     class Config(CoreModel.Config):
         @staticmethod
         def schema_extra(schema: Dict[str, Any]):
@@ -364,24 +386,6 @@ class ResourcesSpec(CoreModel):
                 extra_types=[{"type": "integer"}, {"type": "string"}],
            )
 
-    # TODO: Remove Range[int] in 0.20. Range[int] for backward compatibility only.
-    cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = (
-        CPUSpec()
-    )
-    memory: Annotated[Range[Memory], Field(description="The RAM size (e.g., `8GB`)")] = (
-        DEFAULT_MEMORY_SIZE
-    )
-    shm_size: Annotated[
-        Optional[Memory],
-        Field(
-            description="The size of shared memory (e.g., `8GB`). "
-            "If you are using parallel communicating processes (e.g., dataloaders in PyTorch), "
-            "you may need to configure this"
-        ),
-    ] = None
-    gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
-    disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK
-
     def pretty_format(self) -> str:
         # TODO: Remove in 0.20. Use self.cpu directly
         cpu = parse_obj_as(CPUSpec, self.cpu)
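These resources.py hunks only move the field declarations ahead of the nested `Config` classes; the `schema_extra` hooks still widen the generated JSON schema so shorthand spellings (a bare integer, a range string) keep validating. A minimal sketch of that widening pattern, assuming pydantic v1 semantics (dstack's `CoreModel` is pydantic-v1 based); `widen_property` is a hypothetical stand-in for the internal `add_extra_schema_types`:

```python
from typing import Any, Dict, List

from pydantic import BaseModel  # pydantic v1 semantics assumed


def widen_property(prop: Dict[str, Any], extra_types: List[Dict[str, Any]]) -> None:
    # Wrap the generated property schema in anyOf so alternative spellings
    # (e.g., a bare integer or a "2..8" range string) also validate.
    original = dict(prop)
    prop.clear()
    prop["anyOf"] = [original, *extra_types]


class GPU(BaseModel):
    count: int = 1

    class Config:
        @staticmethod
        def schema_extra(schema: Dict[str, Any]) -> None:
            widen_property(schema["properties"]["count"], [{"type": "string"}])


print(GPU.schema()["properties"]["count"])
# anyOf now lists the generated integer schema plus {"type": "string"}
```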
dstack/_internal/core/models/runs.py
CHANGED

@@ -1,6 +1,7 @@
 from datetime import datetime, timedelta
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Type
+from urllib.parse import urlparse
 
 from pydantic import UUID4, Field, root_validator
 from typing_extensions import Annotated
@@ -483,6 +484,9 @@ class ServiceSpec(CoreModel):
     model: Optional[ServiceModelSpec] = None
     options: Dict[str, Any] = {}
 
+    def get_domain(self) -> Optional[str]:
+        return urlparse(self.url).hostname
+
 
 class RunStatus(str, Enum):
     PENDING = "pending"
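The new `ServiceSpec.get_domain` is a thin wrapper over the standard library: `urlparse(...).hostname` strips the scheme, credentials, port, and path, and lowercases the host. A quick illustration with a made-up URL:

```python
from urllib.parse import urlparse

# hostname drops scheme, userinfo, port, and path, and lowercases the host
url = "https://user:token@Service.Gateway.Example.com:8443/v1/chat"
print(urlparse(url).hostname)  # service.gateway.example.com
```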
dstack/_internal/core/services/configs/__init__.py
CHANGED

@@ -68,8 +68,8 @@ class ConfigManager:
         if len(self.config.projects) == 1:
             self.config.projects[0].default = True
 
-    def
-        return
+    def list_project_configs(self) -> list[ProjectConfig]:
+        return self.config.projects
 
     def delete_project(self, name: str):
         self.config.projects = [p for p in self.config.projects if p.name != name]
dstack/_internal/core/services/profiles.py
CHANGED

@@ -37,10 +37,10 @@ def get_termination(
 ) -> Tuple[TerminationPolicy, int]:
     termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
     termination_idle_time = default_termination_idle_time
-    if profile.idle_duration is not None and
+    if profile.idle_duration is not None and profile.idle_duration < 0:
         termination_policy = TerminationPolicy.DONT_DESTROY
     elif profile.idle_duration is not None:
         termination_idle_time = profile.idle_duration
     if termination_policy == TerminationPolicy.DONT_DESTROY:
         termination_idle_time = -1
-    return termination_policy,
+    return termination_policy, termination_idle_time
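With this change, a negative `idle_duration` selects the keep-alive policy and normalizes the idle time to `-1`, while any other non-None value overrides the default. A self-contained sketch of that decision table, with plain strings standing in for `TerminationPolicy`:

```python
from typing import Optional, Tuple

DONT_DESTROY = "dont-destroy"
DESTROY_AFTER_IDLE = "destroy-after-idle"


def get_termination(idle_duration: Optional[int], default_idle: int = 300) -> Tuple[str, int]:
    policy, idle = DESTROY_AFTER_IDLE, default_idle
    if idle_duration is not None and idle_duration < 0:
        policy = DONT_DESTROY
    elif idle_duration is not None:
        idle = idle_duration
    if policy == DONT_DESTROY:
        idle = -1  # never destroyed, so the idle time is irrelevant
    return policy, idle


assert get_termination(None) == (DESTROY_AFTER_IDLE, 300)
assert get_termination(600) == (DESTROY_AFTER_IDLE, 600)
assert get_termination(-1) == (DONT_DESTROY, -1)
```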
dstack/_internal/core/services/repos.py
CHANGED

@@ -2,7 +2,7 @@ import os
 from pathlib import Path
 from typing import Optional, Union
 
-import git
+import git.cmd
 import requests
 import yaml
 from git.exc import GitCommandError
@@ -24,6 +24,8 @@ logger = get_logger(__name__)
 gh_config_path = os.path.expanduser("~/.config/gh/hosts.yml")
 default_ssh_key = os.path.expanduser("~/.ssh/id_rsa")
 
+no_prompt_env = dict(GIT_TERMINAL_PROMPT="0")
+
 
 class InvalidRepoCredentialsError(DstackError):
     pass
@@ -84,7 +86,7 @@ def get_local_repo_credentials(
 
 def check_remote_repo_credentials_https(url: GitRepoURL, oauth_token: str) -> RemoteRepoCreds:
     try:
-        git.cmd.Git().ls_remote(url.as_https(oauth_token), env=dict(GIT_TERMINAL_PROMPT="0"))
+        git.cmd.Git().ls_remote(url.as_https(oauth_token), env=no_prompt_env)
     except GitCommandError:
         masked = len(oauth_token[:-4]) * "*" + oauth_token[-4:]
         raise InvalidRepoCredentialsError(
@@ -131,7 +133,7 @@ def get_default_branch(remote_url: str) -> Optional[str]:
     Get the default branch of a remote Git repository.
     """
     try:
-        output = git.cmd.Git().ls_remote("--symref", remote_url, "HEAD")
+        output = git.cmd.Git().ls_remote("--symref", remote_url, "HEAD", env=no_prompt_env)
         for line in output.splitlines():
             if line.startswith("ref:"):
                 return line.split()[1].split("/")[-1]
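Setting `GIT_TERMINAL_PROMPT=0` makes git fail fast instead of blocking on an interactive credentials prompt, which is what lets these `ls_remote` probes surface bad tokens as `GitCommandError`. The same probe in isolation, assuming GitPython is installed and using a placeholder repo URL:

```python
import git.cmd
from git.exc import GitCommandError

no_prompt_env = dict(GIT_TERMINAL_PROMPT="0")

try:
    # Without the env var, git could hang here waiting for a username/password.
    git.cmd.Git().ls_remote("https://github.com/example/private-repo.git", env=no_prompt_env)
except GitCommandError as e:
    print(f"credentials rejected or repo not found (exit {e.status})")
```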
dstack/_internal/core/services/ssh/ports.py
CHANGED

@@ -74,7 +74,7 @@ class PortsLock:
         try:
             sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
             if IS_WINDOWS:
-                sock.setsockopt(socket.SOL_SOCKET, socket.SO_EXCLUSIVEADDRUSE, 1)
+                sock.setsockopt(socket.SOL_SOCKET, socket.SO_EXCLUSIVEADDRUSE, 1)  # type: ignore[attr-defined]
             sock.bind(("", port))
             return sock
         except socket.error as e:
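The only change here is the `type: ignore[attr-defined]`: `socket.SO_EXCLUSIVEADDRUSE` is exposed only by Windows builds of CPython, so a type check run on another platform flags the attribute. A portable sketch of the same guard using a runtime check instead:

```python
import socket

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# SO_EXCLUSIVEADDRUSE exists only on Windows; the hasattr guard avoids a
# runtime AttributeError on other platforms.
if hasattr(socket, "SO_EXCLUSIVEADDRUSE"):
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_EXCLUSIVEADDRUSE, 1)
sock.bind(("", 0))  # port 0: let the OS pick a free port
print(sock.getsockname()[1])
```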
dstack/_internal/proxy/lib/deps.py
CHANGED

@@ -21,12 +21,16 @@ class ProxyDependencyInjector(ABC):
     def __init__(self) -> None:
         self._service_conn_pool = ServiceConnectionPool()
 
+    # Abstract AsyncGenerator does not need async def since
+    # type checkers infer a different type without yield in body.
+    # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
+
     @abstractmethod
-    async def get_repo(self) -> AsyncGenerator[BaseProxyRepo, None]:
+    def get_repo(self) -> AsyncGenerator[BaseProxyRepo, None]:
         pass
 
     @abstractmethod
-    async def get_auth_provider(self) -> AsyncGenerator[BaseProxyAuthProvider, None]:
+    def get_auth_provider(self) -> AsyncGenerator[BaseProxyAuthProvider, None]:
         pass
 
     async def get_service_connection_pool(self) -> ServiceConnectionPool:
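The comment added in this hunk points at a real typing subtlety: a function is an async generator only if its body contains `yield`, so an `async def` stub with no yield is inferred as returning `Coroutine[..., None]` rather than `AsyncGenerator`. A minimal sketch of the declare-abstract-as-plain-`def` pattern:

```python
from abc import ABC, abstractmethod
from typing import AsyncGenerator


class Repo:
    pass


class Injector(ABC):
    @abstractmethod
    def get_repo(self) -> AsyncGenerator[Repo, None]:
        # Plain def: with no yield in the body, an async def here would be
        # inferred as Coroutine[..., None] and break the override check.
        ...


class DbInjector(Injector):
    async def get_repo(self) -> AsyncGenerator[Repo, None]:
        yield Repo()  # the yield makes this an async generator function
```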
dstack/_internal/server/app.py
CHANGED

@@ -110,9 +110,11 @@ async def lifespan(app: FastAPI):
     _print_dstack_logo()
     if not check_required_ssh_version():
         logger.warning("OpenSSH 8.4+ is required. The dstack server may not work properly")
+    server_config_manager = None
+    server_config_loaded = False
     if settings.SERVER_CONFIG_ENABLED:
         server_config_manager = ServerConfigManager()
-        config_loaded = server_config_manager.load_config()
+        server_config_loaded = server_config_manager.load_config()
         # Encryption has to be configured before working with users and projects
         await server_config_manager.apply_encryption()
     async with get_session_ctx() as session:
@@ -126,11 +128,9 @@ async def lifespan(app: FastAPI):
             session=session,
             user=admin,
         )
-        if settings.SERVER_CONFIG_ENABLED:
-            server_config_dir = str(SERVER_CONFIG_FILE_PATH).replace(
-                os.path.expanduser("~"), "~", 1
-            )
-            if not config_loaded:
+        if server_config_manager is not None:
+            server_config_dir = _get_server_config_dir()
+            if not server_config_loaded:
                 logger.info("Initializing the default configuration...", {"show_path": False})
                 await server_config_manager.init_config(session=session)
                 logger.info(
@@ -153,6 +153,7 @@ async def lifespan(app: FastAPI):
     )
     if settings.SERVER_S3_BUCKET is not None or settings.SERVER_GCS_BUCKET is not None:
         init_default_storage()
+    scheduler = None
     if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
         scheduler = start_background_tasks()
     else:
@@ -167,7 +168,7 @@ async def lifespan(app: FastAPI):
     for func in _ON_STARTUP_HOOKS:
         await func(app)
     yield
-    if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
+    if scheduler is not None:
         scheduler.shutdown()
     PROBES_SCHEDULER.shutdown(wait=False)
     await gateway_connections_pool.remove_all()
@@ -371,6 +372,18 @@ def _is_prometheus_request(request: Request) -> bool:
     return request.url.path.startswith("/metrics")
 
 
+def _sentry_traces_sampler(sampling_context: SamplingContext) -> float:
+    parent_sampling_decision = sampling_context["parent_sampled"]
+    if parent_sampling_decision is not None:
+        return float(parent_sampling_decision)
+    transaction_context = sampling_context["transaction_context"]
+    name = transaction_context.get("name")
+    if name is not None:
+        if name.startswith("background."):
+            return settings.SENTRY_TRACES_BACKGROUND_SAMPLE_RATE
+    return settings.SENTRY_TRACES_SAMPLE_RATE
+
+
 def _print_dstack_logo():
     console.print(
         """[purple]╱╱╭╮╱╱╭╮╱╱╱╱╱╱╭╮
@@ -387,13 +400,5 @@ def _print_dstack_logo():
     )
 
 
-def _sentry_traces_sampler(sampling_context: SamplingContext) -> float:
-    parent_sampling_decision = sampling_context["parent_sampled"]
-    if parent_sampling_decision is not None:
-        return float(parent_sampling_decision)
-    transaction_context = sampling_context["transaction_context"]
-    name = transaction_context.get("name")
-    if name is not None:
-        if name.startswith("background."):
-            return settings.SENTRY_TRACES_BACKGROUND_SAMPLE_RATE
-    return settings.SENTRY_TRACES_SAMPLE_RATE
+def _get_server_config_dir() -> str:
+    return str(SERVER_CONFIG_FILE_PATH).replace(os.path.expanduser("~"), "~", 1)
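`_sentry_traces_sampler` follows sentry-sdk's `traces_sampler` contract: honor an upstream sampling decision when present, otherwise choose a rate from the transaction name. A sketch of how such a sampler is registered, assuming sentry-sdk; the DSN and rates are placeholders:

```python
import sentry_sdk


def traces_sampler(sampling_context) -> float:
    parent_sampled = sampling_context.get("parent_sampled")
    if parent_sampled is not None:
        return float(parent_sampled)  # keep distributed traces consistent
    name = sampling_context["transaction_context"].get("name") or ""
    if name.startswith("background."):
        return 0.01  # sample noisy background transactions less often
    return 0.1


sentry_sdk.init(
    dsn="https://publickey@o0.ingest.sentry.io/0",  # placeholder DSN
    traces_sampler=traces_sampler,
)
```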
dstack/_internal/server/background/tasks/process_gateways.py
CHANGED

@@ -49,8 +49,8 @@ async def process_gateways():
             if gateway_model is None:
                 return
             lockset.add(gateway_model.id)
+        gateway_model_id = gateway_model.id
         try:
-            gateway_model_id = gateway_model.id
             initial_status = gateway_model.status
             if initial_status == GatewayStatus.SUBMITTED:
                 await _process_submitted_gateway(session=session, gateway_model=gateway_model)
@@ -165,6 +165,9 @@ async def _process_provisioning_gateway(
     )
     gateway_model = res.unique().scalar_one()
 
+    # Provisioning gateways must have compute.
+    assert gateway_model.gateway_compute is not None
+
     # FIXME: problems caused by blocking on connect_to_gateway_with_retry and configure_gateway:
     # - cannot delete the gateway before it is provisioned because the DB model is locked
     # - connection retry counter is reset on server restart
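Several background processors in this release (gateways here; instances, running jobs, and runs below) receive the same micro-fix: the model id is captured before the `try` block, so the `finally` clause that releases the lockset can never hit an unbound local if the attribute read itself raises. The shape of the pattern, sketched with plain-Python stand-ins for the ORM model and locker:

```python
from typing import Set
from uuid import UUID, uuid4


class Model:
    def __init__(self) -> None:
        self.id: UUID = uuid4()


lockset: Set[UUID] = set()


def handle(model: Model) -> None:
    pass  # stand-in for the real per-model processing, which may raise


def process_next(model: Model) -> None:
    lockset.add(model.id)
    model_id = model.id  # captured before try: the finally below always sees it
    try:
        handle(model)
    finally:
        lockset.difference_update([model_id])


process_next(Model())
assert not lockset  # the lock is always released
```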
dstack/_internal/server/background/tasks/process_instances.py
CHANGED

@@ -85,8 +85,10 @@ from dstack._internal.server.services.instances import (
     get_instance_provisioning_data,
     get_instance_requirements,
     get_instance_ssh_private_keys,
+    remove_dangling_tasks_from_instance,
 )
 from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import is_divisible_into_blocks
 from dstack._internal.server.services.placement import (
     get_fleet_placement_group_models,
@@ -181,8 +183,8 @@ async def _process_next_instance():
             if instance is None:
                 return
             lockset.add(instance.id)
+        instance_model_id = instance.id
         try:
-            instance_model_id = instance.id
             await _process_instance(session=session, instance=instance)
         finally:
             lockset.difference_update([instance_model_id])
@@ -393,6 +395,7 @@ async def _add_remote(instance: InstanceModel) -> None:
         return
 
     region = instance.region
+    assert region is not None  # always set for ssh instances
     jpd = JobProvisioningData(
         backend=BackendType.REMOTE,
         instance_type=instance_type,
@@ -788,6 +791,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non
         ssh_private_keys,
         job_provisioning_data,
         None,
+        instance=instance,
         check_instance_health=check_instance_health,
     )
     if instance_check is False:
@@ -934,7 +938,7 @@ async def _wait_for_instance_provisioning_data(
 
 @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
 def _check_instance_inner(
-    ports: Dict[int, int], *, check_instance_health: bool = False
+    ports: Dict[int, int], *, instance: InstanceModel, check_instance_health: bool = False
 ) -> InstanceCheck:
     instance_health_response: Optional[InstanceHealthResponse] = None
     shim_client = runner_client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
@@ -954,6 +958,10 @@ def _check_instance_inner(
             args = (method.__func__.__name__, e.__class__.__name__, e)
             logger.exception(template, *args)
             return InstanceCheck(reachable=False, message=template % args)
+    try:
+        remove_dangling_tasks_from_instance(shim_client, instance)
+    except Exception as e:
+        logger.exception("%s: error removing dangling tasks: %s", fmt(instance), e)
     return runner_client.healthcheck_response_to_instance_check(
         healthcheck_response, instance_health_response
     )
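`_check_instance_inner` gains a keyword-only `instance` parameter that the caller passes by name; the dstack-internal `@runner_ssh_tunnel` decorator injects the `ports` mapping and forwards the remaining keywords untouched. A toy sketch of that forwarding shape, with the SSH tunnel faked as an identity port map:

```python
import functools
from typing import Callable, Dict, List


def runner_ssh_tunnel(ports: List[int]):
    # Toy stand-in: the real decorator opens an SSH tunnel and maps remote
    # ports to local ones; here the mapping is simply the identity.
    def decorator(func: Callable):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            port_map: Dict[int, int] = {p: p for p in ports}
            return func(port_map, *args, **kwargs)  # extra kwargs pass through

        return wrapper

    return decorator


@runner_ssh_tunnel(ports=[10998])
def check_instance(ports: Dict[int, int], *, instance: str, check_health: bool = False) -> str:
    return f"checking {instance} via port {ports[10998]} (health={check_health})"


print(check_instance(instance="i-0123", check_health=True))
```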
dstack/_internal/server/background/tasks/process_probes.py
CHANGED

@@ -120,7 +120,7 @@ async def _execute_probe(probe: ProbeModel, probe_spec: ProbeSpec) -> bool:
         method=probe_spec.method,
         url="http://dstack" + probe_spec.url,
         headers=[(h.name, h.value) for h in probe_spec.headers],
-
+        content=probe_spec.body,
         timeout=probe_spec.timeout,
         follow_redirects=False,
     )
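The probe request now forwards the configured body; the keyword names (`content=`, `follow_redirects=`) match httpx's request API, so a minimal equivalent probe can be sketched directly with httpx (placeholder URL, trivial body):

```python
import httpx


async def execute_probe() -> bool:
    async with httpx.AsyncClient() as client:
        resp = await client.request(
            method="GET",
            url="http://localhost:8000/health",  # placeholder target
            content=b"",  # raw request body, as probe_spec.body above
            timeout=10.0,
            follow_redirects=False,
        )
    return resp.is_success
```

Run it with `asyncio.run(execute_probe())` against a live endpoint.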
dstack/_internal/server/background/tasks/process_running_jobs.py
CHANGED

@@ -128,9 +128,8 @@ async def _process_next_running_job():
             if job_model is None:
                 return
             lockset.add(job_model.id)
-
+        job_model_id = job_model.id
         try:
-            job_model_id = job_model.id
             await _process_running_job(session=session, job_model=job_model)
         finally:
             lockset.difference_update([job_model_id])
@@ -170,6 +169,11 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
 
+    volumes = []
+    secrets = {}
+    cluster_info = None
+    repo_creds = None
+
     initial_status = job_model.status
     if initial_status in [JobStatus.PROVISIONING, JobStatus.PULLING]:
         # Wait until all other jobs in the replica are provisioned
@@ -257,6 +261,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 user_ssh_key,
             )
         else:
+            assert cluster_info is not None
             logger.debug(
                 "%s: process provisioning job without shim, age=%s",
                 fmt(job_model),
@@ -275,7 +280,6 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 repo=repo_model,
                 code_hash=_get_repo_code_hash(run, job),
             )
-
             success = await common_utils.run_async(
                 _submit_job_to_runner,
                 server_ssh_private_keys,
@@ -309,6 +313,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
     else:  # fails are not acceptable
         if initial_status == JobStatus.PULLING:
+            assert cluster_info is not None
             logger.debug(
                 "%s: process pulling job with shim, age=%s", fmt(job_model), job_submission.age
             )
@@ -341,7 +346,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 server_ssh_private_keys,
                 job_provisioning_data,
             )
-
+        else:
             logger.debug("%s: process running job, age=%s", fmt(job_model), job_submission.age)
             success = await common_utils.run_async(
                 _process_running,
@@ -632,6 +637,7 @@ def _process_pulling_with_shim(
     is successful
     """
     shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
+    job_runtime_data = None
     if shim_client.is_api_v2_supported():  # raises error if shim is down, causes retry
         task = shim_client.get_task(job_model.id)
 
dstack/_internal/server/background/tasks/process_runs.py
CHANGED

@@ -129,8 +129,8 @@ async def _process_next_run():
             job_ids = [j.id for j in run_model.jobs]
             run_lockset.add(run_model.id)
             job_lockset.update(job_ids)
+        run_model_id = run_model.id
         try:
-            run_model_id = run_model.id
             await _process_run(session=session, run_model=run_model)
         finally:
             run_lockset.difference_update([run_model_id])