dstack 0.19.26__py3-none-any.whl → 0.19.28__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of dstack might be problematic.

Files changed (93)
  1. dstack/_internal/cli/commands/__init__.py +11 -8
  2. dstack/_internal/cli/commands/apply.py +6 -3
  3. dstack/_internal/cli/commands/completion.py +3 -1
  4. dstack/_internal/cli/commands/config.py +1 -0
  5. dstack/_internal/cli/commands/init.py +4 -4
  6. dstack/_internal/cli/commands/offer.py +1 -1
  7. dstack/_internal/cli/commands/project.py +1 -0
  8. dstack/_internal/cli/commands/server.py +2 -2
  9. dstack/_internal/cli/main.py +1 -1
  10. dstack/_internal/cli/services/configurators/base.py +2 -4
  11. dstack/_internal/cli/services/configurators/fleet.py +4 -5
  12. dstack/_internal/cli/services/configurators/gateway.py +3 -5
  13. dstack/_internal/cli/services/configurators/run.py +165 -43
  14. dstack/_internal/cli/services/configurators/volume.py +3 -5
  15. dstack/_internal/cli/services/repos.py +1 -18
  16. dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
  17. dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
  18. dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
  19. dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
  20. dstack/_internal/core/backends/aws/compute.py +6 -1
  21. dstack/_internal/core/backends/base/compute.py +33 -5
  22. dstack/_internal/core/backends/base/offers.py +2 -0
  23. dstack/_internal/core/backends/configurators.py +15 -0
  24. dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
  25. dstack/_internal/core/backends/digitalocean/backend.py +16 -0
  26. dstack/_internal/core/backends/digitalocean/compute.py +5 -0
  27. dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
  28. dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
  29. dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
  30. dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
  31. dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
  32. dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
  33. dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
  34. dstack/_internal/core/backends/gcp/compute.py +32 -8
  35. dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
  36. dstack/_internal/core/backends/hotaisle/compute.py +1 -6
  37. dstack/_internal/core/backends/models.py +7 -0
  38. dstack/_internal/core/backends/nebius/compute.py +0 -7
  39. dstack/_internal/core/backends/oci/compute.py +4 -5
  40. dstack/_internal/core/backends/vultr/compute.py +1 -5
  41. dstack/_internal/core/compatibility/fleets.py +5 -0
  42. dstack/_internal/core/compatibility/runs.py +10 -1
  43. dstack/_internal/core/models/backends/base.py +5 -1
  44. dstack/_internal/core/models/common.py +67 -43
  45. dstack/_internal/core/models/configurations.py +109 -69
  46. dstack/_internal/core/models/files.py +1 -1
  47. dstack/_internal/core/models/fleets.py +115 -25
  48. dstack/_internal/core/models/instances.py +5 -5
  49. dstack/_internal/core/models/profiles.py +66 -47
  50. dstack/_internal/core/models/repos/remote.py +21 -16
  51. dstack/_internal/core/models/resources.py +69 -65
  52. dstack/_internal/core/models/runs.py +41 -14
  53. dstack/_internal/core/services/repos.py +85 -80
  54. dstack/_internal/server/app.py +5 -0
  55. dstack/_internal/server/background/tasks/process_fleets.py +117 -13
  56. dstack/_internal/server/background/tasks/process_instances.py +12 -71
  57. dstack/_internal/server/background/tasks/process_running_jobs.py +2 -0
  58. dstack/_internal/server/background/tasks/process_runs.py +2 -0
  59. dstack/_internal/server/background/tasks/process_submitted_jobs.py +48 -16
  60. dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
  61. dstack/_internal/server/models.py +11 -7
  62. dstack/_internal/server/schemas/gateways.py +10 -9
  63. dstack/_internal/server/schemas/runner.py +1 -0
  64. dstack/_internal/server/services/backends/handlers.py +2 -0
  65. dstack/_internal/server/services/docker.py +8 -7
  66. dstack/_internal/server/services/fleets.py +23 -25
  67. dstack/_internal/server/services/instances.py +3 -3
  68. dstack/_internal/server/services/jobs/configurators/base.py +46 -6
  69. dstack/_internal/server/services/jobs/configurators/dev.py +4 -4
  70. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -5
  71. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +4 -6
  72. dstack/_internal/server/services/jobs/configurators/service.py +0 -3
  73. dstack/_internal/server/services/jobs/configurators/task.py +0 -3
  74. dstack/_internal/server/services/projects.py +52 -1
  75. dstack/_internal/server/services/runs.py +16 -0
  76. dstack/_internal/server/settings.py +46 -0
  77. dstack/_internal/server/statics/index.html +1 -1
  78. dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-5e0d56245c4bd241ec27.css} +1 -1
  79. dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-a2a16772fbf11a14d191.js} +1215 -998
  80. dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-a2a16772fbf11a14d191.js.map} +1 -1
  81. dstack/_internal/server/testing/common.py +6 -3
  82. dstack/_internal/utils/env.py +85 -11
  83. dstack/_internal/utils/path.py +8 -1
  84. dstack/_internal/utils/ssh.py +7 -0
  85. dstack/api/_public/repos.py +41 -6
  86. dstack/api/_public/runs.py +14 -1
  87. dstack/version.py +1 -1
  88. {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/METADATA +2 -2
  89. {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/RECORD +92 -78
  90. dstack/_internal/server/statics/static/media/github.1f7102513534c83a9d8d735d2b8c12a2.svg +0 -3
  91. {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/WHEEL +0 -0
  92. {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/entry_points.txt +0 -0
  93. {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/services/repos.py
@@ -1,9 +1,10 @@
 import os
+from contextlib import suppress
 from pathlib import Path
+from tempfile import NamedTemporaryFile
 from typing import Optional, Union
 
 import git.cmd
-import requests
 import yaml
 from git.exc import GitCommandError
 
@@ -13,135 +14,139 @@ from dstack._internal.core.models.repos import LocalRepo, RemoteRepo, RemoteRepo
 from dstack._internal.core.models.repos.remote import GitRepoURL
 from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.path import PathLike
-from dstack._internal.utils.ssh import (
-    get_host_config,
-    make_ssh_command_for_git,
-    try_ssh_key_passphrase,
-)
+from dstack._internal.utils.ssh import get_host_config, make_git_env, try_ssh_key_passphrase
 
 logger = get_logger(__name__)
 
 gh_config_path = os.path.expanduser("~/.config/gh/hosts.yml")
 default_ssh_key = os.path.expanduser("~/.ssh/id_rsa")
 
-no_prompt_env = dict(GIT_TERMINAL_PROMPT="0")
-
 
 class InvalidRepoCredentialsError(DstackError):
     pass
 
 
-def get_local_repo_credentials(
+def get_repo_creds_and_default_branch(
     repo_url: str,
     identity_file: Optional[PathLike] = None,
+    private_key: Optional[str] = None,
     oauth_token: Optional[str] = None,
-) -> RemoteRepoCreds:
+) -> tuple[RemoteRepoCreds, Optional[str]]:
     url = GitRepoURL.parse(repo_url, get_ssh_config=get_host_config)
 
     # no auth
-    r = requests.get(f"{url.as_https()}/info/refs?service=git-upload-pack", timeout=10)
-    if r.status_code == 200:
-        return RemoteRepoCreds(
-            clone_url=url.as_https(),
-            private_key=None,
-            oauth_token=None,
-        )
-
-    # user-provided ssh key
-    if identity_file is not None:
-        identity_file = os.path.expanduser(identity_file)
-        return check_remote_repo_credentials_ssh(url, identity_file)
-
-    # user-provided oauth token
+    with suppress(InvalidRepoCredentialsError):
+        return _get_repo_creds_and_default_branch_https(url)
+
+    # ssh key provided by the user or pulled from the server
+    if identity_file is not None or private_key is not None:
+        if identity_file is not None:
+            private_key = _read_private_key(identity_file)
+            return _get_repo_creds_and_default_branch_ssh(url, identity_file, private_key)
+        elif private_key is not None:
+            with NamedTemporaryFile("w+", 0o600) as f:
+                f.write(private_key)
+                f.flush()
+                return _get_repo_creds_and_default_branch_ssh(url, f.name, private_key)
+        else:
+            assert False, "should not reach here"
+
+    # oauth token provided by the user or pulled from the server
     if oauth_token is not None:
-        return check_remote_repo_credentials_https(url, oauth_token)
+        return _get_repo_creds_and_default_branch_https(url, oauth_token)
 
     # key from ssh config
    identities = get_host_config(url.original_host).get("identityfile")
     if identities:
-        return check_remote_repo_credentials_ssh(url, identities[0])
+        _identity_file = identities[0]
+        with suppress(InvalidRepoCredentialsError):
+            _private_key = _read_private_key(_identity_file)
+            return _get_repo_creds_and_default_branch_ssh(url, _identity_file, _private_key)
 
     # token from gh config
     if os.path.exists(gh_config_path):
         with open(gh_config_path, "r") as f:
             gh_hosts = yaml.load(f, Loader=yaml.FullLoader)
-        oauth_token = gh_hosts.get(url.host, {}).get("oauth_token")
-        if oauth_token is not None:
-            try:
-                return check_remote_repo_credentials_https(url, oauth_token)
-            except InvalidRepoCredentialsError:
-                pass
+        _oauth_token = gh_hosts.get(url.host, {}).get("oauth_token")
+        if _oauth_token is not None:
+            with suppress(InvalidRepoCredentialsError):
+                return _get_repo_creds_and_default_branch_https(url, _oauth_token)
 
     # default user key
     if os.path.exists(default_ssh_key):
-        try:
-            return check_remote_repo_credentials_ssh(url, default_ssh_key)
-        except InvalidRepoCredentialsError:
-            pass
+        with suppress(InvalidRepoCredentialsError):
+            _private_key = _read_private_key(default_ssh_key)
+            return _get_repo_creds_and_default_branch_ssh(url, default_ssh_key, _private_key)
 
     raise InvalidRepoCredentialsError(
         "No valid default Git credentials found. Pass valid `--token` or `--git-identity`."
     )
 
 
-def check_remote_repo_credentials_https(url: GitRepoURL, oauth_token: str) -> RemoteRepoCreds:
+def _get_repo_creds_and_default_branch_ssh(
+    url: GitRepoURL, identity_file: PathLike, private_key: str
+) -> tuple[RemoteRepoCreds, Optional[str]]:
+    _url = url.as_ssh()
     try:
-        git.cmd.Git().ls_remote(url.as_https(oauth_token), env=no_prompt_env)
-    except GitCommandError:
-        masked = len(oauth_token[:-4]) * "*" + oauth_token[-4:]
-        raise InvalidRepoCredentialsError(
-            f"Can't access `{url.as_https()}` using the `{masked}` token"
-        )
-    return RemoteRepoCreds(
-        clone_url=url.as_https(),
-        oauth_token=oauth_token,
+        default_branch = _get_repo_default_branch(_url, make_git_env(identity_file=identity_file))
+    except GitCommandError as e:
+        message = f"Cannot access `{_url}` using the `{identity_file}` private SSH key"
+        raise InvalidRepoCredentialsError(message) from e
+    creds = RemoteRepoCreds(
+        clone_url=_url,
+        private_key=private_key,
+        oauth_token=None,
+    )
+    return creds, default_branch
+
+
+def _get_repo_creds_and_default_branch_https(
+    url: GitRepoURL, oauth_token: Optional[str] = None
+) -> tuple[RemoteRepoCreds, Optional[str]]:
+    _url = url.as_https()
+    try:
+        default_branch = _get_repo_default_branch(url.as_https(oauth_token), make_git_env())
+    except GitCommandError as e:
+        message = f"Cannot access `{_url}`"
+        if oauth_token is not None:
+            masked_token = len(oauth_token[:-4]) * "*" + oauth_token[-4:]
+            message = f"{message} using the `{masked_token}` token"
+        raise InvalidRepoCredentialsError(message) from e
+    creds = RemoteRepoCreds(
+        clone_url=_url,
         private_key=None,
+        oauth_token=oauth_token,
     )
+    return creds, default_branch
 
 
-def check_remote_repo_credentials_ssh(url: GitRepoURL, identity_file: PathLike) -> RemoteRepoCreds:
+def _get_repo_default_branch(url: str, env: dict[str, str]) -> Optional[str]:
+    # output example: "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
+    output: str = git.cmd.Git().ls_remote("--symref", url, "HEAD", env=env)
+    for line in output.splitlines():
+        # line format: `<oid> TAB <ref> LF`
+        oid, _, ref = line.partition("\t")
+        if oid.startswith("ref:") and ref == "HEAD":
+            return oid.rsplit("/", maxsplit=1)[-1]
+    return None
+
+
+def _read_private_key(identity_file: PathLike) -> str:
+    identity_file = Path(identity_file).expanduser().resolve()
     if not Path(identity_file).exists():
-        raise InvalidRepoCredentialsError(f"The {identity_file} private SSH key doesn't exist")
+        raise InvalidRepoCredentialsError(f"The `{identity_file}` private SSH key doesn't exist")
     if not os.access(identity_file, os.R_OK):
-        raise InvalidRepoCredentialsError(f"Can't access the {identity_file} private SSH key")
+        raise InvalidRepoCredentialsError(f"Cannot access the `{identity_file}` private SSH key")
     if not try_ssh_key_passphrase(identity_file):
         raise InvalidRepoCredentialsError(
             f"Cannot use the `{identity_file}` private SSH key. "
             "Ensure that it is valid and passphrase-free"
         )
-    with open(identity_file, "r") as f:
-        private_key = f.read()
-
-    try:
-        git.cmd.Git().ls_remote(
-            url.as_ssh(), env=dict(GIT_SSH_COMMAND=make_ssh_command_for_git(identity_file))
-        )
-    except GitCommandError:
-        raise InvalidRepoCredentialsError(
-            f"Can't access `{url.as_ssh()}` using the `{identity_file}` private SSH key"
-        )
-
-    return RemoteRepoCreds(
-        clone_url=url.as_ssh(),
-        private_key=private_key,
-        oauth_token=None,
-    )
-
-
-def get_default_branch(remote_url: str) -> Optional[str]:
-    """
-    Get the default branch of a remote Git repository.
-    """
-    try:
-        output = git.cmd.Git().ls_remote("--symref", remote_url, "HEAD", env=no_prompt_env)
-        for line in output.splitlines():
-            if line.startswith("ref:"):
-                return line.split()[1].split("/")[-1]
-    except Exception as e:
-        logger.debug("Failed to get remote repo default branch: %s", repr(e))
-    return None
+    with open(identity_file, "r") as file:
+        return file.read()
 
 
+# Used for `config.yml` only, remove it with `repos` in `config.yml`
 def load_repo(config: RepoConfig) -> Union[RemoteRepo, LocalRepo]:
     if config.repo_type == "remote":
         return RemoteRepo(repo_id=config.repo_id, local_repo_dir=config.path)
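
The rewritten `get_repo_creds_and_default_branch` tries credentials in order (anonymous HTTPS, a user-supplied SSH key or in-memory private key, a user-supplied OAuth token, keys from the SSH config, a token from the gh CLI config, and finally `~/.ssh/id_rsa`), and every probe now also resolves the repo's default branch from `git ls-remote --symref <url> HEAD`. Below is a minimal standalone sketch of just that parsing step; the sample output is hardcoded for illustration, while the real helper shells out to git via GitPython:

from typing import Optional

def parse_default_branch(output: str) -> Optional[str]:
    # Each ls-remote line is `<left> TAB <right>`; the symref line looks like
    # `ref: refs/heads/<branch> TAB HEAD`, so the branch is its last path part.
    for line in output.splitlines():
        left, _, right = line.partition("\t")
        if left.startswith("ref:") and right == "HEAD":
            return left.rsplit("/", maxsplit=1)[-1]
    return None

sample = "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
assert parse_default_branch(sample) == "dev"
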
dstack/_internal/server/app.py
@@ -160,6 +160,11 @@ async def lifespan(app: FastAPI):
         logger.info("Background processing is disabled")
     PROBES_SCHEDULER.start()
     dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
+    logger.info(
+        "Job network mode: %s (%d)",
+        settings.JOB_NETWORK_MODE.name,
+        settings.JOB_NETWORK_MODE.value,
+    )
     logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
     logger.info(
         f"The dstack server {dstack_version} is running at {SERVER_URL}",
dstack/_internal/server/background/tasks/process_fleets.py
@@ -1,11 +1,13 @@
 from datetime import timedelta
 from typing import List
+from uuid import UUID
 
 from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload, load_only
 
-from dstack._internal.core.models.fleets import FleetStatus
+from dstack._internal.core.models.fleets import FleetSpec, FleetStatus
+from dstack._internal.core.models.instances import InstanceStatus
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -15,7 +17,9 @@ from dstack._internal.server.models import (
     RunModel,
 )
 from dstack._internal.server.services.fleets import (
+    create_fleet_instance_model,
     get_fleet_spec,
+    get_next_instance_num,
     is_fleet_empty,
     is_fleet_in_use,
 )
@@ -65,34 +69,122 @@ async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel])
     res = await session.execute(
         select(FleetModel)
         .where(FleetModel.id.in_(fleet_ids))
-        .options(joinedload(FleetModel.instances).load_only(InstanceModel.deleted))
         .options(
-            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id)
+            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id),
+            joinedload(FleetModel.project),
         )
         .options(joinedload(FleetModel.runs).load_only(RunModel.status))
         .execution_options(populate_existing=True)
     )
     fleet_models = list(res.unique().scalars().all())
 
+    # TODO: Drop fleets auto-deletion after dropping fleets auto-creation.
     deleted_fleets_ids = []
-    now = get_current_datetime()
     for fleet_model in fleet_models:
+        _consolidate_fleet_state_with_spec(session, fleet_model)
         deleted = _autodelete_fleet(fleet_model)
         if deleted:
             deleted_fleets_ids.append(fleet_model.id)
-        fleet_model.last_processed_at = now
+        fleet_model.last_processed_at = get_current_datetime()
+    await _update_deleted_fleets_placement_groups(session, deleted_fleets_ids)
+    await session.commit()
 
-    await session.execute(
-        update(PlacementGroupModel)
-        .where(
-            PlacementGroupModel.fleet_id.in_(deleted_fleets_ids),
+
+def _consolidate_fleet_state_with_spec(session: AsyncSession, fleet_model: FleetModel):
+    if fleet_model.status == FleetStatus.TERMINATING:
+        return
+    fleet_spec = get_fleet_spec(fleet_model)
+    if fleet_spec.configuration.nodes is None or fleet_spec.autocreated:
+        # Only explicitly created cloud fleets are consolidated.
+        return
+    if not _is_fleet_ready_for_consolidation(fleet_model):
+        return
+    added_instances = _maintain_fleet_nodes_min(session, fleet_model, fleet_spec)
+    if added_instances:
+        fleet_model.consolidation_attempt += 1
+    else:
+        # The fleet is already consolidated or consolidation is in progress.
+        # We reset consolidation_attempt in both cases for simplicity.
+        # The second case does not need reset but is ok to do since
+        # it means consolidation is longer than delay, so it won't happen too often.
+        # TODO: Reset consolidation_attempt on fleet in-place update.
+        fleet_model.consolidation_attempt = 0
+    fleet_model.last_consolidated_at = get_current_datetime()
+
+
+def _is_fleet_ready_for_consolidation(fleet_model: FleetModel) -> bool:
+    consolidation_retry_delay = _get_consolidation_retry_delay(fleet_model.consolidation_attempt)
+    last_consolidated_at = fleet_model.last_consolidated_at or fleet_model.last_processed_at
+    duration_since_last_consolidation = get_current_datetime() - last_consolidated_at
+    return duration_since_last_consolidation >= consolidation_retry_delay
+
+
+# We use exponentially increasing consolidation retry delays so that
+# consolidation does not happen too often. In particular, this prevents
+# retrying instance provisioning constantly in case of no offers.
+# TODO: Adjust delays.
+_CONSOLIDATION_RETRY_DELAYS = [
+    timedelta(seconds=30),
+    timedelta(minutes=1),
+    timedelta(minutes=2),
+    timedelta(minutes=5),
+    timedelta(minutes=10),
+]
+
+
+def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta:
+    if consolidation_attempt < len(_CONSOLIDATION_RETRY_DELAYS):
+        return _CONSOLIDATION_RETRY_DELAYS[consolidation_attempt]
+    return _CONSOLIDATION_RETRY_DELAYS[-1]
+
+
+def _maintain_fleet_nodes_min(
+    session: AsyncSession,
+    fleet_model: FleetModel,
+    fleet_spec: FleetSpec,
+) -> bool:
+    """
+    Ensures the fleet has at least `nodes.min` instances.
+    Returns `True` if retried or added new instances and `False` otherwise.
+    """
+    assert fleet_spec.configuration.nodes is not None
+    for instance in fleet_model.instances:
+        # Delete terminated but not deleted instances since
+        # they are going to be replaced with new pending instances.
+        if instance.status == InstanceStatus.TERMINATED and not instance.deleted:
+            # It's safe to modify instances without instance lock since
+            # no other task modifies already terminated instances.
+            instance.deleted = True
+            instance.deleted_at = get_current_datetime()
+    active_instances = [i for i in fleet_model.instances if not i.deleted]
+    active_instances_num = len(active_instances)
+    if active_instances_num >= fleet_spec.configuration.nodes.min:
+        return False
+    nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num
+    for i in range(nodes_missing):
+        instance_model = create_fleet_instance_model(
+            session=session,
+            project=fleet_model.project,
+            # TODO: Store fleet.user and pass it instead of the project owner.
+            username=fleet_model.project.owner.name,
+            spec=fleet_spec,
+            instance_num=get_next_instance_num({i.instance_num for i in active_instances}),
         )
-        .values(fleet_deleted=True)
-    )
-    await session.commit()
+        active_instances.append(instance_model)
+        fleet_model.instances.append(instance_model)
+    logger.info("Added %s instances to fleet %s", nodes_missing, fleet_model.name)
+    return True
 
 
 def _autodelete_fleet(fleet_model: FleetModel) -> bool:
+    if fleet_model.project.deleted:
+        # It used to be possible to delete project with active resources:
+        # https://github.com/dstackai/dstack/issues/3077
+        fleet_model.status = FleetStatus.TERMINATED
+        fleet_model.deleted = True
+        logger.info("Fleet %s deleted due to deleted project", fleet_model.name)
+        return True
+
     if is_fleet_in_use(fleet_model) or not is_fleet_empty(fleet_model):
         return False
 
@@ -100,7 +192,7 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     if (
         fleet_model.status != FleetStatus.TERMINATING
         and fleet_spec.configuration.nodes is not None
-        and (fleet_spec.configuration.nodes.min is None or fleet_spec.configuration.nodes.min == 0)
+        and fleet_spec.configuration.nodes.min == 0
     ):
         # Empty fleets that allow 0 nodes should not be auto-deleted
         return False
@@ -110,3 +202,15 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     fleet_model.deleted = True
     logger.info("Fleet %s deleted", fleet_model.name)
     return True
+
+
+async def _update_deleted_fleets_placement_groups(session: AsyncSession, fleets_ids: list[UUID]):
+    if len(fleets_ids) == 0:
+        return
+    await session.execute(
+        update(PlacementGroupModel)
+        .where(
+            PlacementGroupModel.fleet_id.in_(fleets_ids),
+        )
+        .values(fleet_deleted=True)
+    )
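
Fleet consolidation retries on a capped backoff schedule rather than a fixed interval: the attempt counter indexes into a fixed delay table, and any attempt past the end reuses the last, largest delay. A self-contained sketch of that lookup, with the delays mirrored from the diff above:

from datetime import timedelta

CONSOLIDATION_RETRY_DELAYS = [
    timedelta(seconds=30),
    timedelta(minutes=1),
    timedelta(minutes=2),
    timedelta(minutes=5),
    timedelta(minutes=10),
]

def get_consolidation_retry_delay(attempt: int) -> timedelta:
    # Attempts 0..4 use the table; attempt 5 and beyond stay at 10 minutes.
    if attempt < len(CONSOLIDATION_RETRY_DELAYS):
        return CONSOLIDATION_RETRY_DELAYS[attempt]
    return CONSOLIDATION_RETRY_DELAYS[-1]

assert get_consolidation_retry_delay(0) == timedelta(seconds=30)
assert get_consolidation_retry_delay(99) == timedelta(minutes=10)
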
dstack/_internal/server/background/tasks/process_instances.py
@@ -53,14 +53,12 @@ from dstack._internal.core.models.placement import (
     PlacementStrategy,
 )
 from dstack._internal.core.models.profiles import (
-    RetryEvent,
     TerminationPolicy,
 )
 from dstack._internal.core.models.runs import (
     JobProvisioningData,
     Retry,
 )
-from dstack._internal.core.services.profiles import get_retry
 from dstack._internal.server import settings as server_settings
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_db, get_session_ctx
@@ -327,7 +325,6 @@ async def _add_remote(instance: InstanceModel) -> None:
             e,
         )
         instance.status = InstanceStatus.PENDING
-        instance.last_retry_at = get_current_datetime()
         return
 
     instance_type = host_info_to_instance_type(host_info, cpu_arch)
@@ -426,7 +423,6 @@ async def _add_remote(instance: InstanceModel) -> None:
     instance.offer = instance_offer.json()
     instance.job_provisioning_data = jpd.json()
     instance.started_at = get_current_datetime()
-    instance.last_retry_at = get_current_datetime()
 
 
 def _deploy_instance(
@@ -493,29 +489,6 @@
 
 
 async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
-    if instance.last_retry_at is not None:
-        last_retry = instance.last_retry_at
-        if get_current_datetime() < last_retry + timedelta(minutes=1):
-            return
-
-    if (
-        instance.profile is None
-        or instance.requirements is None
-        or instance.instance_configuration is None
-    ):
-        instance.status = InstanceStatus.TERMINATED
-        instance.termination_reason = "Empty profile, requirements or instance_configuration"
-        instance.last_retry_at = get_current_datetime()
-        logger.warning(
-            "Empty profile, requirements or instance_configuration. Terminate instance: %s",
-            instance.name,
-            extra={
-                "instance_name": instance.name,
-                "instance_status": InstanceStatus.TERMINATED.value,
-            },
-        )
-        return
-
     if _need_to_wait_fleet_provisioning(instance):
         logger.debug("Waiting for the first instance in the fleet to be provisioned")
         return
@@ -529,7 +502,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         instance.termination_reason = (
             f"Error to parse profile, requirements or instance_configuration: {e}"
         )
-        instance.last_retry_at = get_current_datetime()
         logger.warning(
             "Error to parse profile, requirements or instance_configuration. Terminate instance: %s",
             instance.name,
@@ -540,24 +512,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         )
         return
 
-    retry = get_retry(profile)
-    should_retry = retry is not None and RetryEvent.NO_CAPACITY in retry.on_events
-
-    if retry is not None:
-        retry_duration_deadline = _get_retry_duration_deadline(instance, retry)
-        if get_current_datetime() > retry_duration_deadline:
-            instance.status = InstanceStatus.TERMINATED
-            instance.termination_reason = "Retry duration expired"
-            logger.warning(
-                "Retry duration expired. Terminating instance %s",
-                instance.name,
-                extra={
-                    "instance_name": instance.name,
-                    "instance_status": InstanceStatus.TERMINATED.value,
-                },
-            )
-            return
-
     placement_group_models = []
     placement_group_model = None
     if instance.fleet_id:
@@ -595,15 +549,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         exclude_not_available=True,
     )
 
-    if not offers and should_retry:
-        instance.last_retry_at = get_current_datetime()
-        logger.debug(
-            "No offers for instance %s. Next retry",
-            instance.name,
-            extra={"instance_name": instance.name},
-        )
-        return
-
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
     for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
@@ -681,7 +626,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         instance.offer = instance_offer.json()
         instance.total_blocks = instance_offer.total_blocks
         instance.started_at = get_current_datetime()
-        instance.last_retry_at = get_current_datetime()
 
         logger.info(
             "Created instance %s",
@@ -702,21 +646,18 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         )
         return
 
-    instance.last_retry_at = get_current_datetime()
-
-    if not should_retry:
-        _mark_terminated(instance, "All offers failed" if offers else "No offers found")
-        if (
-            instance.fleet
-            and _is_fleet_master_instance(instance)
-            and _is_cloud_cluster(instance.fleet)
-        ):
-            # Do not attempt to deploy other instances, as they won't determine the correct cluster
-            # backend, region, and placement group without a successfully deployed master instance
-            for sibling_instance in instance.fleet.instances:
-                if sibling_instance.id == instance.id:
-                    continue
-                _mark_terminated(sibling_instance, "Master instance failed to start")
+    _mark_terminated(instance, "All offers failed" if offers else "No offers found")
+    if (
+        instance.fleet
+        and _is_fleet_master_instance(instance)
+        and _is_cloud_cluster(instance.fleet)
+    ):
+        # Do not attempt to deploy other instances, as they won't determine the correct cluster
+        # backend, region, and placement group without a successfully deployed master instance
+        for sibling_instance in instance.fleet.instances:
+            if sibling_instance.id == instance.id:
+                continue
+            _mark_terminated(sibling_instance, "Master instance failed to start")
 
 
 def _mark_terminated(instance: InstanceModel, termination_reason: str) -> None:
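
With the per-instance retry machinery (`last_retry_at`, `get_retry`, the `RetryEvent.NO_CAPACITY` check) removed, a provisioning attempt that finds no working offer now terminates the instance immediately and leaves re-creation to fleet consolidation; for cloud clusters, a failed master instance still takes its siblings down with it. A toy sketch of that cascade, using plain dataclasses in place of the ORM models:

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class Instance:
    id: int
    termination_reason: Optional[str] = None

@dataclass
class Fleet:
    instances: list[Instance] = field(default_factory=list)

def mark_terminated(instance: Instance, reason: str) -> None:
    instance.termination_reason = reason

def on_master_provisioning_failed(master: Instance, fleet: Fleet) -> None:
    # Siblings cannot determine the right backend/region/placement group
    # without a successfully deployed master, so terminate them too.
    mark_terminated(master, "All offers failed")
    for sibling in fleet.instances:
        if sibling.id == master.id:
            continue
        mark_terminated(sibling, "Master instance failed to start")

fleet = Fleet(instances=[Instance(id=1), Instance(id=2), Instance(id=3)])
on_master_provisioning_failed(fleet.instances[0], fleet)
assert [i.termination_reason for i in fleet.instances] == [
    "All offers failed",
    "Master instance failed to start",
    "Master instance failed to start",
]
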
dstack/_internal/server/background/tasks/process_running_jobs.py
@@ -41,6 +41,7 @@ from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, Vol
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
+    FleetModel,
     InstanceModel,
     JobModel,
     ProbeModel,
@@ -151,6 +152,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         .options(joinedload(RunModel.project))
         .options(joinedload(RunModel.user))
         .options(joinedload(RunModel.repo))
+        .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name))
         .options(joinedload(RunModel.jobs))
     )
     run_model = res.unique().scalar_one()
dstack/_internal/server/background/tasks/process_runs.py
@@ -21,6 +21,7 @@ from dstack._internal.core.models.runs import (
 )
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
+    FleetModel,
     InstanceModel,
     JobModel,
     ProjectModel,
@@ -145,6 +146,7 @@ async def _process_run(session: AsyncSession, run_model: RunModel):
         .execution_options(populate_existing=True)
         .options(joinedload(RunModel.project).load_only(ProjectModel.id, ProjectModel.name))
         .options(joinedload(RunModel.user).load_only(UserModel.name))
+        .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name))
         .options(
             selectinload(RunModel.jobs)
             .joinedload(JobModel.instance)
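
Both the run- and job-processing queries now eager-load the run's fleet while fetching only its id and name columns. A minimal, self-contained sketch of that SQLAlchemy pattern with toy models and an in-memory SQLite database (the model shapes here are illustrative, not dstack's real schema):

import uuid
from sqlalchemy import ForeignKey, create_engine, select
from sqlalchemy.orm import (
    DeclarativeBase, Mapped, Session, joinedload, mapped_column, relationship,
)

class Base(DeclarativeBase):
    pass

class FleetModel(Base):
    __tablename__ = "fleets"
    id: Mapped[str] = mapped_column(primary_key=True, default=lambda: str(uuid.uuid4()))
    name: Mapped[str]
    spec: Mapped[str]  # stands in for the large serialized spec column

class RunModel(Base):
    __tablename__ = "runs"
    id: Mapped[str] = mapped_column(primary_key=True, default=lambda: str(uuid.uuid4()))
    fleet_id: Mapped[str] = mapped_column(ForeignKey("fleets.id"))
    fleet: Mapped[FleetModel] = relationship()

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    fleet = FleetModel(name="my-fleet", spec="...")
    session.add_all([fleet, RunModel(fleet=fleet)])
    session.commit()
    # One SELECT with a JOIN; only fleets.id and fleets.name are fetched
    # eagerly, so wide columns like spec are not pulled in by this query.
    run = session.execute(
        select(RunModel).options(
            joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name)
        )
    ).scalar_one()
    print(run.fleet.name)
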