dstack 0.19.26__py3-none-any.whl → 0.19.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +11 -8
- dstack/_internal/cli/commands/apply.py +6 -3
- dstack/_internal/cli/commands/completion.py +3 -1
- dstack/_internal/cli/commands/config.py +1 -0
- dstack/_internal/cli/commands/init.py +4 -4
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/commands/project.py +1 -0
- dstack/_internal/cli/commands/server.py +2 -2
- dstack/_internal/cli/main.py +1 -1
- dstack/_internal/cli/services/configurators/base.py +2 -4
- dstack/_internal/cli/services/configurators/fleet.py +4 -5
- dstack/_internal/cli/services/configurators/gateway.py +3 -5
- dstack/_internal/cli/services/configurators/run.py +165 -43
- dstack/_internal/cli/services/configurators/volume.py +3 -5
- dstack/_internal/cli/services/repos.py +1 -18
- dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
- dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
- dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
- dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
- dstack/_internal/core/backends/aws/compute.py +6 -1
- dstack/_internal/core/backends/base/compute.py +33 -5
- dstack/_internal/core/backends/base/offers.py +2 -0
- dstack/_internal/core/backends/configurators.py +15 -0
- dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean/backend.py +16 -0
- dstack/_internal/core/backends/digitalocean/compute.py +5 -0
- dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
- dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
- dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
- dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
- dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
- dstack/_internal/core/backends/gcp/compute.py +32 -8
- dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
- dstack/_internal/core/backends/hotaisle/compute.py +1 -6
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/backends/nebius/compute.py +0 -7
- dstack/_internal/core/backends/oci/compute.py +4 -5
- dstack/_internal/core/backends/vultr/compute.py +1 -5
- dstack/_internal/core/compatibility/fleets.py +5 -0
- dstack/_internal/core/compatibility/runs.py +10 -1
- dstack/_internal/core/models/backends/base.py +5 -1
- dstack/_internal/core/models/common.py +67 -43
- dstack/_internal/core/models/configurations.py +109 -69
- dstack/_internal/core/models/files.py +1 -1
- dstack/_internal/core/models/fleets.py +115 -25
- dstack/_internal/core/models/instances.py +5 -5
- dstack/_internal/core/models/profiles.py +66 -47
- dstack/_internal/core/models/repos/remote.py +21 -16
- dstack/_internal/core/models/resources.py +69 -65
- dstack/_internal/core/models/runs.py +41 -14
- dstack/_internal/core/services/repos.py +85 -80
- dstack/_internal/server/app.py +5 -0
- dstack/_internal/server/background/tasks/process_fleets.py +117 -13
- dstack/_internal/server/background/tasks/process_instances.py +12 -71
- dstack/_internal/server/background/tasks/process_running_jobs.py +2 -0
- dstack/_internal/server/background/tasks/process_runs.py +2 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +48 -16
- dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
- dstack/_internal/server/models.py +11 -7
- dstack/_internal/server/schemas/gateways.py +10 -9
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/backends/handlers.py +2 -0
- dstack/_internal/server/services/docker.py +8 -7
- dstack/_internal/server/services/fleets.py +23 -25
- dstack/_internal/server/services/instances.py +3 -3
- dstack/_internal/server/services/jobs/configurators/base.py +46 -6
- dstack/_internal/server/services/jobs/configurators/dev.py +4 -4
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -5
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +4 -6
- dstack/_internal/server/services/jobs/configurators/service.py +0 -3
- dstack/_internal/server/services/jobs/configurators/task.py +0 -3
- dstack/_internal/server/services/projects.py +52 -1
- dstack/_internal/server/services/runs.py +16 -0
- dstack/_internal/server/settings.py +46 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-5e0d56245c4bd241ec27.css} +1 -1
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-a2a16772fbf11a14d191.js} +1215 -998
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-a2a16772fbf11a14d191.js.map} +1 -1
- dstack/_internal/server/testing/common.py +6 -3
- dstack/_internal/utils/env.py +85 -11
- dstack/_internal/utils/path.py +8 -1
- dstack/_internal/utils/ssh.py +7 -0
- dstack/api/_public/repos.py +41 -6
- dstack/api/_public/runs.py +14 -1
- dstack/version.py +1 -1
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/METADATA +2 -2
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/RECORD +92 -78
- dstack/_internal/server/statics/static/media/github.1f7102513534c83a9d8d735d2b8c12a2.svg +0 -3
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/WHEEL +0 -0
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/services/repos.py
CHANGED

@@ -1,9 +1,10 @@
 import os
+from contextlib import suppress
 from pathlib import Path
+from tempfile import NamedTemporaryFile
 from typing import Optional, Union

 import git.cmd
-import requests
 import yaml
 from git.exc import GitCommandError

@@ -13,135 +14,139 @@ from dstack._internal.core.models.repos import LocalRepo, RemoteRepo, RemoteRepo…
 from dstack._internal.core.models.repos.remote import GitRepoURL
 from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.path import PathLike
-from dstack._internal.utils.ssh import (
-    get_host_config,
-    make_ssh_command_for_git,
-    try_ssh_key_passphrase,
-)
+from dstack._internal.utils.ssh import get_host_config, make_git_env, try_ssh_key_passphrase

 logger = get_logger(__name__)

 gh_config_path = os.path.expanduser("~/.config/gh/hosts.yml")
 default_ssh_key = os.path.expanduser("~/.ssh/id_rsa")

-no_prompt_env = dict(GIT_TERMINAL_PROMPT="0")
-

 class InvalidRepoCredentialsError(DstackError):
     pass


-def …
+def get_repo_creds_and_default_branch(
     repo_url: str,
     identity_file: Optional[PathLike] = None,
+    private_key: Optional[str] = None,
     oauth_token: Optional[str] = None,
-) -> RemoteRepoCreds:
+) -> tuple[RemoteRepoCreds, Optional[str]]:
     url = GitRepoURL.parse(repo_url, get_ssh_config=get_host_config)

     # no auth
-    … (14 lines elided)
+    with suppress(InvalidRepoCredentialsError):
+        return _get_repo_creds_and_default_branch_https(url)
+
+    # ssh key provided by the user or pulled from the server
+    if identity_file is not None or private_key is not None:
+        if identity_file is not None:
+            private_key = _read_private_key(identity_file)
+            return _get_repo_creds_and_default_branch_ssh(url, identity_file, private_key)
+        elif private_key is not None:
+            with NamedTemporaryFile("w+", 0o600) as f:
+                f.write(private_key)
+                f.flush()
+                return _get_repo_creds_and_default_branch_ssh(url, f.name, private_key)
+        else:
+            assert False, "should not reach here"
+
+    # oauth token provided by the user or pulled from the server
     if oauth_token is not None:
-        return …
+        return _get_repo_creds_and_default_branch_https(url, oauth_token)

     # key from ssh config
     identities = get_host_config(url.original_host).get("identityfile")
     if identities:
-        …
+        _identity_file = identities[0]
+        with suppress(InvalidRepoCredentialsError):
+            _private_key = _read_private_key(_identity_file)
+            return _get_repo_creds_and_default_branch_ssh(url, _identity_file, _private_key)

     # token from gh config
     if os.path.exists(gh_config_path):
         with open(gh_config_path, "r") as f:
             gh_hosts = yaml.load(f, Loader=yaml.FullLoader)
-        …
-        if …
-        …
-        return …
-        except InvalidRepoCredentialsError:
-            pass
+        _oauth_token = gh_hosts.get(url.host, {}).get("oauth_token")
+        if _oauth_token is not None:
+            with suppress(InvalidRepoCredentialsError):
+                return _get_repo_creds_and_default_branch_https(url, _oauth_token)

     # default user key
     if os.path.exists(default_ssh_key):
-        … (3 lines elided)
-            pass
+        with suppress(InvalidRepoCredentialsError):
+            _private_key = _read_private_key(default_ssh_key)
+            return _get_repo_creds_and_default_branch_ssh(url, default_ssh_key, _private_key)

     raise InvalidRepoCredentialsError(
         "No valid default Git credentials found. Pass valid `--token` or `--git-identity`."
     )


-def …
+def _get_repo_creds_and_default_branch_ssh(
+    url: GitRepoURL, identity_file: PathLike, private_key: str
+) -> tuple[RemoteRepoCreds, Optional[str]]:
+    _url = url.as_ssh()
     try:
-        …
-    except GitCommandError:
-        …
-        raise InvalidRepoCredentialsError(
-        … (5 lines elided)
+        default_branch = _get_repo_default_branch(_url, make_git_env(identity_file=identity_file))
+    except GitCommandError as e:
+        message = f"Cannot access `{_url}` using the `{identity_file}` private SSH key"
+        raise InvalidRepoCredentialsError(message) from e
+    creds = RemoteRepoCreds(
+        clone_url=_url,
+        private_key=private_key,
+        oauth_token=None,
+    )
+    return creds, default_branch
+
+
+def _get_repo_creds_and_default_branch_https(
+    url: GitRepoURL, oauth_token: Optional[str] = None
+) -> tuple[RemoteRepoCreds, Optional[str]]:
+    _url = url.as_https()
+    try:
+        default_branch = _get_repo_default_branch(url.as_https(oauth_token), make_git_env())
+    except GitCommandError as e:
+        message = f"Cannot access `{_url}`"
+        if oauth_token is not None:
+            masked_token = len(oauth_token[:-4]) * "*" + oauth_token[-4:]
+            message = f"{message} using the `{masked_token}` token"
+        raise InvalidRepoCredentialsError(message) from e
+    creds = RemoteRepoCreds(
+        clone_url=_url,
         private_key=None,
+        oauth_token=oauth_token,
     )
+    return creds, default_branch


-def …
+def _get_repo_default_branch(url: str, env: dict[str, str]) -> Optional[str]:
+    # output example: "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
+    output: str = git.cmd.Git().ls_remote("--symref", url, "HEAD", env=env)
+    for line in output.splitlines():
+        # line format: `<oid> TAB <ref> LF`
+        oid, _, ref = line.partition("\t")
+        if oid.startswith("ref:") and ref == "HEAD":
+            return oid.rsplit("/", maxsplit=1)[-1]
+    return None
+
+
+def _read_private_key(identity_file: PathLike) -> str:
+    identity_file = Path(identity_file).expanduser().resolve()
     if not Path(identity_file).exists():
-        raise InvalidRepoCredentialsError(f"The {identity_file} private SSH key doesn't exist")
+        raise InvalidRepoCredentialsError(f"The `{identity_file}` private SSH key doesn't exist")
     if not os.access(identity_file, os.R_OK):
-        raise InvalidRepoCredentialsError(f"…
+        raise InvalidRepoCredentialsError(f"Cannot access the `{identity_file}` private SSH key")
     if not try_ssh_key_passphrase(identity_file):
         raise InvalidRepoCredentialsError(
             f"Cannot use the `{identity_file}` private SSH key. "
             "Ensure that it is valid and passphrase-free"
         )
-    with open(identity_file, "r") as …
-    … (2 lines elided)
-    try:
-        git.cmd.Git().ls_remote(
-            url.as_ssh(), env=dict(GIT_SSH_COMMAND=make_ssh_command_for_git(identity_file))
-        )
-    except GitCommandError:
-        raise InvalidRepoCredentialsError(
-            f"Can't access `{url.as_ssh()}` using the `{identity_file}` private SSH key"
-        )
-
-    return RemoteRepoCreds(
-        clone_url=url.as_ssh(),
-        private_key=private_key,
-        oauth_token=None,
-    )
-
-
-def get_default_branch(remote_url: str) -> Optional[str]:
-    """
-    Get the default branch of a remote Git repository.
-    """
-    try:
-        output = git.cmd.Git().ls_remote("--symref", remote_url, "HEAD", env=no_prompt_env)
-        for line in output.splitlines():
-            if line.startswith("ref:"):
-                return line.split()[1].split("/")[-1]
-    except Exception as e:
-        logger.debug("Failed to get remote repo default branch: %s", repr(e))
-    return None
+    with open(identity_file, "r") as file:
+        return file.read()


+# Used for `config.yml` only, remove it with `repos` in `config.yml`
 def load_repo(config: RepoConfig) -> Union[RemoteRepo, LocalRepo]:
     if config.repo_type == "remote":
         return RemoteRepo(repo_id=config.repo_id, local_repo_dir=config.path)
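The new `_get_repo_default_branch` helper derives the default branch from `git ls-remote --symref <url> HEAD` output, and `get_repo_creds_and_default_branch` now returns a `(RemoteRepoCreds, Optional[str])` tuple so callers get the default branch alongside the credentials instead of resolving it with the removed `get_default_branch`. A standalone sketch of the same parsing, with the sample output quoted in the diff hardcoded in place of a real network call:

```python
from typing import Optional

# Sample `git ls-remote --symref <url> HEAD` output, as quoted in the diff.
SAMPLE_OUTPUT = (
    "ref: refs/heads/dev\tHEAD\n"
    "545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
)


def parse_default_branch(output: str) -> Optional[str]:
    # Each line has the format `<oid> TAB <ref>`; the symref line carries
    # `ref: refs/heads/<branch>` in the oid position.
    for line in output.splitlines():
        oid, _, ref = line.partition("\t")
        if oid.startswith("ref:") and ref == "HEAD":
            return oid.rsplit("/", maxsplit=1)[-1]
    return None


assert parse_default_branch(SAMPLE_OUTPUT) == "dev"
```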
dstack/_internal/server/app.py
CHANGED

@@ -160,6 +160,11 @@ async def lifespan(app: FastAPI):
         logger.info("Background processing is disabled")
     PROBES_SCHEDULER.start()
     dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
+    logger.info(
+        "Job network mode: %s (%d)",
+        settings.JOB_NETWORK_MODE.name,
+        settings.JOB_NETWORK_MODE.value,
+    )
     logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
     logger.info(
         f"The dstack server {dstack_version} is running at {SERVER_URL}",
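The added startup log formats the setting with `%s (%d)`, which implies `JOB_NETWORK_MODE` is an int-valued enum. A minimal sketch of the logging pattern, with a hypothetical `JobNetworkMode` enum standing in for the real setting (the actual members live in `dstack/_internal/server/settings.py` and are not shown in this diff):

```python
import logging
from enum import IntEnum

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("dstack.sketch")


class JobNetworkMode(IntEnum):
    # Hypothetical members for illustration only; the real names are
    # defined in the server settings module.
    HOST = 1
    BRIDGE = 2


JOB_NETWORK_MODE = JobNetworkMode.HOST

# Same format string as the added lifespan log line.
logger.info("Job network mode: %s (%d)", JOB_NETWORK_MODE.name, JOB_NETWORK_MODE.value)
# INFO:dstack.sketch:Job network mode: HOST (1)
```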
dstack/_internal/server/background/tasks/process_fleets.py
CHANGED

@@ -1,11 +1,13 @@
 from datetime import timedelta
 from typing import List
+from uuid import UUID

 from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload, load_only

-from dstack._internal.core.models.fleets import FleetStatus
+from dstack._internal.core.models.fleets import FleetSpec, FleetStatus
+from dstack._internal.core.models.instances import InstanceStatus
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -15,7 +17,9 @@ from dstack._internal.server.models import (
     RunModel,
 )
 from dstack._internal.server.services.fleets import (
+    create_fleet_instance_model,
     get_fleet_spec,
+    get_next_instance_num,
     is_fleet_empty,
     is_fleet_in_use,
 )
@@ -65,34 +69,122 @@ async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel])
     res = await session.execute(
         select(FleetModel)
         .where(FleetModel.id.in_(fleet_ids))
-        .options(joinedload(FleetModel.instances).load_only(InstanceModel.deleted))
         .options(
-            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id)
+            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id),
+            joinedload(FleetModel.project),
         )
         .options(joinedload(FleetModel.runs).load_only(RunModel.status))
         .execution_options(populate_existing=True)
     )
     fleet_models = list(res.unique().scalars().all())

+    # TODO: Drop fleets auto-deletion after dropping fleets auto-creation.
     deleted_fleets_ids = []
-    now = get_current_datetime()
     for fleet_model in fleet_models:
+        _consolidate_fleet_state_with_spec(session, fleet_model)
         deleted = _autodelete_fleet(fleet_model)
         if deleted:
             deleted_fleets_ids.append(fleet_model.id)
-        fleet_model.last_processed_at = now
+        fleet_model.last_processed_at = get_current_datetime()
+    await _update_deleted_fleets_placement_groups(session, deleted_fleets_ids)
+    await session.commit()

-    … (4 lines elided)
+
+def _consolidate_fleet_state_with_spec(session: AsyncSession, fleet_model: FleetModel):
+    if fleet_model.status == FleetStatus.TERMINATING:
+        return
+    fleet_spec = get_fleet_spec(fleet_model)
+    if fleet_spec.configuration.nodes is None or fleet_spec.autocreated:
+        # Only explicitly created cloud fleets are consolidated.
+        return
+    if not _is_fleet_ready_for_consolidation(fleet_model):
+        return
+    added_instances = _maintain_fleet_nodes_min(session, fleet_model, fleet_spec)
+    if added_instances:
+        fleet_model.consolidation_attempt += 1
+    else:
+        # The fleet is already consolidated or consolidation is in progress.
+        # We reset consolidation_attempt in both cases for simplicity.
+        # The second case does not need reset but is ok to do since
+        # it means consolidation is longer than delay, so it won't happen too often.
+        # TODO: Reset consolidation_attempt on fleet in-place update.
+        fleet_model.consolidation_attempt = 0
+    fleet_model.last_consolidated_at = get_current_datetime()
+
+
+def _is_fleet_ready_for_consolidation(fleet_model: FleetModel) -> bool:
+    consolidation_retry_delay = _get_consolidation_retry_delay(fleet_model.consolidation_attempt)
+    last_consolidated_at = fleet_model.last_consolidated_at or fleet_model.last_processed_at
+    duration_since_last_consolidation = get_current_datetime() - last_consolidated_at
+    return duration_since_last_consolidation >= consolidation_retry_delay
+
+
+# We use exponentially increasing consolidation retry delays so that
+# consolidation does not happen too often. In particular, this prevents
+# retrying instance provisioning constantly in case of no offers.
+# TODO: Adjust delays.
+_CONSOLIDATION_RETRY_DELAYS = [
+    timedelta(seconds=30),
+    timedelta(minutes=1),
+    timedelta(minutes=2),
+    timedelta(minutes=5),
+    timedelta(minutes=10),
+]
+
+
+def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta:
+    if consolidation_attempt < len(_CONSOLIDATION_RETRY_DELAYS):
+        return _CONSOLIDATION_RETRY_DELAYS[consolidation_attempt]
+    return _CONSOLIDATION_RETRY_DELAYS[-1]
+
+
+def _maintain_fleet_nodes_min(
+    session: AsyncSession,
+    fleet_model: FleetModel,
+    fleet_spec: FleetSpec,
+) -> bool:
+    """
+    Ensures the fleet has at least `nodes.min` instances.
+    Returns `True` if retried or added new instances and `False` otherwise.
+    """
+    assert fleet_spec.configuration.nodes is not None
+    for instance in fleet_model.instances:
+        # Delete terminated but not deleted instances since
+        # they are going to be replaced with new pending instances.
+        if instance.status == InstanceStatus.TERMINATED and not instance.deleted:
+            # It's safe to modify instances without instance lock since
+            # no other task modifies already terminated instances.
+            instance.deleted = True
+            instance.deleted_at = get_current_datetime()
+    active_instances = [i for i in fleet_model.instances if not i.deleted]
+    active_instances_num = len(active_instances)
+    if active_instances_num >= fleet_spec.configuration.nodes.min:
+        return False
+    nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num
+    for i in range(nodes_missing):
+        instance_model = create_fleet_instance_model(
+            session=session,
+            project=fleet_model.project,
+            # TODO: Store fleet.user and pass it instead of the project owner.
+            username=fleet_model.project.owner.name,
+            spec=fleet_spec,
+            instance_num=get_next_instance_num({i.instance_num for i in active_instances}),
         )
-        . …
-        … (2 lines elided)
+        active_instances.append(instance_model)
+        fleet_model.instances.append(instance_model)
+    logger.info("Added %s instances to fleet %s", nodes_missing, fleet_model.name)
+    return True


 def _autodelete_fleet(fleet_model: FleetModel) -> bool:
+    if fleet_model.project.deleted:
+        # It used to be possible to delete project with active resources:
+        # https://github.com/dstackai/dstack/issues/3077
+        fleet_model.status = FleetStatus.TERMINATED
+        fleet_model.deleted = True
+        logger.info("Fleet %s deleted due to deleted project", fleet_model.name)
+        return True
+
     if is_fleet_in_use(fleet_model) or not is_fleet_empty(fleet_model):
         return False

@@ -100,7 +192,7 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     if (
         fleet_model.status != FleetStatus.TERMINATING
         and fleet_spec.configuration.nodes is not None
-        and …
+        and fleet_spec.configuration.nodes.min == 0
     ):
         # Empty fleets that allow 0 nodes should not be auto-deleted
         return False
@@ -110,3 +202,15 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     fleet_model.deleted = True
     logger.info("Fleet %s deleted", fleet_model.name)
     return True
+
+
+async def _update_deleted_fleets_placement_groups(session: AsyncSession, fleets_ids: list[UUID]):
+    if len(fleets_ids) == 0:
+        return
+    await session.execute(
+        update(PlacementGroupModel)
+        .where(
+            PlacementGroupModel.fleet_id.in_(fleets_ids),
+        )
+        .values(fleet_deleted=True)
+    )
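For reference, the consolidation backoff added above grows from 30 seconds to a 10-minute cap and then stays there for all later attempts. A self-contained sketch reproducing the schedule:

```python
from datetime import timedelta

# Delay schedule copied from the diff; attempts past the end of the list
# keep reusing the last (10-minute) entry.
_CONSOLIDATION_RETRY_DELAYS = [
    timedelta(seconds=30),
    timedelta(minutes=1),
    timedelta(minutes=2),
    timedelta(minutes=5),
    timedelta(minutes=10),
]


def get_consolidation_retry_delay(attempt: int) -> timedelta:
    if attempt < len(_CONSOLIDATION_RETRY_DELAYS):
        return _CONSOLIDATION_RETRY_DELAYS[attempt]
    return _CONSOLIDATION_RETRY_DELAYS[-1]


for attempt in (0, 1, 2, 5, 10):
    print(attempt, get_consolidation_retry_delay(attempt))
# 0 0:00:30, 1 0:01:00, 2 0:02:00, 5 0:10:00, 10 0:10:00
```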
dstack/_internal/server/background/tasks/process_instances.py
CHANGED

@@ -53,14 +53,12 @@ from dstack._internal.core.models.placement import (
     PlacementStrategy,
 )
 from dstack._internal.core.models.profiles import (
-    RetryEvent,
     TerminationPolicy,
 )
 from dstack._internal.core.models.runs import (
     JobProvisioningData,
     Retry,
 )
-from dstack._internal.core.services.profiles import get_retry
 from dstack._internal.server import settings as server_settings
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_db, get_session_ctx
@@ -327,7 +325,6 @@ async def _add_remote(instance: InstanceModel) -> None:
             e,
         )
         instance.status = InstanceStatus.PENDING
-        instance.last_retry_at = get_current_datetime()
         return

     instance_type = host_info_to_instance_type(host_info, cpu_arch)
@@ -426,7 +423,6 @@ async def _add_remote(instance: InstanceModel) -> None:
     instance.offer = instance_offer.json()
     instance.job_provisioning_data = jpd.json()
     instance.started_at = get_current_datetime()
-    instance.last_retry_at = get_current_datetime()


 def _deploy_instance(
@@ -493,29 +489,6 @@ def _deploy_instance(


 async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
-    if instance.last_retry_at is not None:
-        last_retry = instance.last_retry_at
-        if get_current_datetime() < last_retry + timedelta(minutes=1):
-            return
-
-    if (
-        instance.profile is None
-        or instance.requirements is None
-        or instance.instance_configuration is None
-    ):
-        instance.status = InstanceStatus.TERMINATED
-        instance.termination_reason = "Empty profile, requirements or instance_configuration"
-        instance.last_retry_at = get_current_datetime()
-        logger.warning(
-            "Empty profile, requirements or instance_configuration. Terminate instance: %s",
-            instance.name,
-            extra={
-                "instance_name": instance.name,
-                "instance_status": InstanceStatus.TERMINATED.value,
-            },
-        )
-        return
-
     if _need_to_wait_fleet_provisioning(instance):
         logger.debug("Waiting for the first instance in the fleet to be provisioned")
         return
@@ -529,7 +502,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No…
         instance.termination_reason = (
             f"Error to parse profile, requirements or instance_configuration: {e}"
         )
-        instance.last_retry_at = get_current_datetime()
         logger.warning(
             "Error to parse profile, requirements or instance_configuration. Terminate instance: %s",
             instance.name,
@@ -540,24 +512,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No…
         )
         return

-    retry = get_retry(profile)
-    should_retry = retry is not None and RetryEvent.NO_CAPACITY in retry.on_events
-
-    if retry is not None:
-        retry_duration_deadline = _get_retry_duration_deadline(instance, retry)
-        if get_current_datetime() > retry_duration_deadline:
-            instance.status = InstanceStatus.TERMINATED
-            instance.termination_reason = "Retry duration expired"
-            logger.warning(
-                "Retry duration expired. Terminating instance %s",
-                instance.name,
-                extra={
-                    "instance_name": instance.name,
-                    "instance_status": InstanceStatus.TERMINATED.value,
-                },
-            )
-            return
-
     placement_group_models = []
     placement_group_model = None
     if instance.fleet_id:
@@ -595,15 +549,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No…
         exclude_not_available=True,
     )

-    if not offers and should_retry:
-        instance.last_retry_at = get_current_datetime()
-        logger.debug(
-            "No offers for instance %s. Next retry",
-            instance.name,
-            extra={"instance_name": instance.name},
-        )
-        return
-
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
     for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
@@ -681,7 +626,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No…
         instance.offer = instance_offer.json()
         instance.total_blocks = instance_offer.total_blocks
         instance.started_at = get_current_datetime()
-        instance.last_retry_at = get_current_datetime()

         logger.info(
             "Created instance %s",
@@ -702,21 +646,18 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No…
         )
         return

-    instance …
-    … (11 lines elided)
-            if sibling_instance.id == instance.id:
-                continue
-            _mark_terminated(sibling_instance, "Master instance failed to start")
+    _mark_terminated(instance, "All offers failed" if offers else "No offers found")
+    if (
+        instance.fleet
+        and _is_fleet_master_instance(instance)
+        and _is_cloud_cluster(instance.fleet)
+    ):
+        # Do not attempt to deploy other instances, as they won't determine the correct cluster
+        # backend, region, and placement group without a successfully deployed master instance
+        for sibling_instance in instance.fleet.instances:
+            if sibling_instance.id == instance.id:
+                continue
+            _mark_terminated(sibling_instance, "Master instance failed to start")


 def _mark_terminated(instance: InstanceModel, termination_reason: str) -> None:
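The failure path above now terminates the whole cluster when the master instance cannot be provisioned. A minimal sketch of that control flow, with plain dataclasses standing in for the SQLAlchemy models (`Instance`/`Fleet` here are illustrative stand-ins, not the real `InstanceModel`/`FleetModel`):

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Instance:
    id: int
    termination_reason: Optional[str] = None


@dataclass
class Fleet:
    instances: List[Instance] = field(default_factory=list)


def mark_terminated(instance: Instance, reason: str) -> None:
    instance.termination_reason = reason


def fail_master(instance: Instance, fleet: Fleet, offers: list) -> None:
    # Mirrors the diff's failure path: the failed instance records whether
    # any offers existed, and its cluster siblings are terminated with it.
    mark_terminated(instance, "All offers failed" if offers else "No offers found")
    for sibling in fleet.instances:
        if sibling.id == instance.id:
            continue
        mark_terminated(sibling, "Master instance failed to start")


fleet = Fleet([Instance(1), Instance(2), Instance(3)])
fail_master(fleet.instances[0], fleet, offers=[])
print([i.termination_reason for i in fleet.instances])
# ['No offers found', 'Master instance failed to start', 'Master instance failed to start']
```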
dstack/_internal/server/background/tasks/process_running_jobs.py
CHANGED

@@ -41,6 +41,7 @@ from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, Vol…
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
+    FleetModel,
     InstanceModel,
     JobModel,
     ProbeModel,
@@ -151,6 +152,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         .options(joinedload(RunModel.project))
         .options(joinedload(RunModel.user))
         .options(joinedload(RunModel.repo))
+        .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name))
         .options(joinedload(RunModel.jobs))
     )
     run_model = res.unique().scalar_one()
dstack/_internal/server/background/tasks/process_runs.py
CHANGED

@@ -21,6 +21,7 @@ from dstack._internal.core.models.runs import (
 )
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
+    FleetModel,
     InstanceModel,
     JobModel,
     ProjectModel,
@@ -145,6 +146,7 @@ async def _process_run(session: AsyncSession, run_model: RunModel):
         .execution_options(populate_existing=True)
         .options(joinedload(RunModel.project).load_only(ProjectModel.id, ProjectModel.name))
         .options(joinedload(RunModel.user).load_only(UserModel.name))
+        .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name))
         .options(
             selectinload(RunModel.jobs)
             .joinedload(JobModel.instance)