dstack 0.19.26__py3-none-any.whl → 0.19.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68)
  1. dstack/_internal/cli/commands/init.py +2 -2
  2. dstack/_internal/cli/services/configurators/run.py +114 -16
  3. dstack/_internal/cli/services/repos.py +1 -18
  4. dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
  5. dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
  6. dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
  7. dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
  8. dstack/_internal/core/backends/aws/compute.py +6 -1
  9. dstack/_internal/core/backends/base/compute.py +33 -5
  10. dstack/_internal/core/backends/base/offers.py +2 -0
  11. dstack/_internal/core/backends/configurators.py +15 -0
  12. dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
  13. dstack/_internal/core/backends/digitalocean/backend.py +16 -0
  14. dstack/_internal/core/backends/digitalocean/compute.py +5 -0
  15. dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
  16. dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
  17. dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
  18. dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
  19. dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
  20. dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
  21. dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
  22. dstack/_internal/core/backends/gcp/compute.py +32 -8
  23. dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
  24. dstack/_internal/core/backends/hotaisle/compute.py +1 -6
  25. dstack/_internal/core/backends/models.py +7 -0
  26. dstack/_internal/core/backends/nebius/compute.py +0 -7
  27. dstack/_internal/core/backends/oci/compute.py +4 -5
  28. dstack/_internal/core/backends/vultr/compute.py +1 -5
  29. dstack/_internal/core/compatibility/fleets.py +5 -0
  30. dstack/_internal/core/compatibility/runs.py +8 -1
  31. dstack/_internal/core/models/backends/base.py +5 -1
  32. dstack/_internal/core/models/configurations.py +21 -7
  33. dstack/_internal/core/models/files.py +1 -1
  34. dstack/_internal/core/models/fleets.py +75 -2
  35. dstack/_internal/core/models/runs.py +24 -5
  36. dstack/_internal/core/services/repos.py +85 -80
  37. dstack/_internal/server/background/tasks/process_fleets.py +109 -13
  38. dstack/_internal/server/background/tasks/process_instances.py +12 -71
  39. dstack/_internal/server/background/tasks/process_running_jobs.py +2 -0
  40. dstack/_internal/server/background/tasks/process_runs.py +2 -0
  41. dstack/_internal/server/background/tasks/process_submitted_jobs.py +18 -6
  42. dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
  43. dstack/_internal/server/models.py +5 -2
  44. dstack/_internal/server/schemas/runner.py +1 -0
  45. dstack/_internal/server/services/fleets.py +23 -25
  46. dstack/_internal/server/services/instances.py +3 -3
  47. dstack/_internal/server/services/jobs/configurators/base.py +46 -6
  48. dstack/_internal/server/services/jobs/configurators/dev.py +4 -4
  49. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -5
  50. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +4 -6
  51. dstack/_internal/server/services/jobs/configurators/service.py +0 -3
  52. dstack/_internal/server/services/jobs/configurators/task.py +0 -3
  53. dstack/_internal/server/services/runs.py +16 -0
  54. dstack/_internal/server/statics/index.html +1 -1
  55. dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-4eecc75fbe64067eb1bc.js} +1146 -899
  56. dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-4eecc75fbe64067eb1bc.js.map} +1 -1
  57. dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-56191c63d516fd0041c4.css} +1 -1
  58. dstack/_internal/server/testing/common.py +6 -3
  59. dstack/_internal/utils/path.py +8 -1
  60. dstack/_internal/utils/ssh.py +7 -0
  61. dstack/api/_public/repos.py +41 -6
  62. dstack/api/_public/runs.py +14 -1
  63. dstack/version.py +1 -1
  64. {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/METADATA +2 -2
  65. {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/RECORD +68 -53
  66. {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/WHEEL +0 -0
  67. {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/entry_points.txt +0 -0
  68. {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/models/configurations.py

@@ -34,7 +34,7 @@ STRIP_PREFIX_DEFAULT = True
 RUN_PRIOTIRY_MIN = 0
 RUN_PRIOTIRY_MAX = 100
 RUN_PRIORITY_DEFAULT = 0
-DEFAULT_REPO_DIR = "/workflow"
+LEGACY_REPO_DIR = "/workflow"
 MIN_PROBE_TIMEOUT = 1
 MIN_PROBE_INTERVAL = 1
 DEFAULT_PROBE_URL = "/"
@@ -112,8 +112,15 @@ class RepoSpec(CoreModel):
         Optional[str],
         Field(description="The commit hash"),
     ] = None
-    # Not implemented, has no effect, hidden in the docs
-    path: str = DEFAULT_REPO_DIR
+    path: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "The repo path inside the run container. Relative paths are resolved"
+                f" relative to the working directory. Defaults to `{LEGACY_REPO_DIR}`"
+            )
+        ),
+    ] = None

     @classmethod
     def parse(cls, v: str) -> Self:
@@ -149,6 +156,14 @@ class RepoSpec(CoreModel):
             raise ValueError("Either `local_path` or `url` must be specified")
         return values

+    @validator("path")
+    def validate_path(cls, v: Optional[str]) -> Optional[str]:
+        if v is None:
+            return v
+        if v.startswith("~") and PurePosixPath(v).parts[0] != "~":
+            raise ValueError("`~username` syntax is not supported")
+        return v
+

 class ScalingSpec(CoreModel):
     metric: Annotated[
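
Note: the new `path` validator allows a bare `~/` prefix but rejects `~username` home references. A minimal standalone sketch of the rule (plain stdlib, not dstack code):

    from pathlib import PurePosixPath

    def path_allowed(path: str) -> bool:
        # Mirrors validate_path: only a bare leading "~" component passes.
        return not (path.startswith("~") and PurePosixPath(path).parts[0] != "~")

    assert path_allowed("~/repo")             # parts[0] == "~"
    assert path_allowed("/workflow/repo")     # absolute paths unaffected
    assert not path_allowed("~alice/repo")    # `~username` syntax rejected
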
@@ -380,7 +395,7 @@ class BaseRunConfiguration(CoreModel):
         Field(
             description=(
                 "The user inside the container, `user_name_or_id[:group_name_or_id]`"
-                " (e.g., `ubuntu`, `1000:1000`). Defaults to the default `image` user"
+                " (e.g., `ubuntu`, `1000:1000`). Defaults to the default user from the `image`"
             )
         ),
     ] = None
@@ -390,9 +405,8 @@ class BaseRunConfiguration(CoreModel):
         Optional[str],
         Field(
             description=(
-                "The path to the working directory inside the container."
-                f" It's specified relative to the repository directory (`{DEFAULT_REPO_DIR}`) and should be inside it."
-                ' Defaults to `"."` '
+                "The absolute path to the working directory inside the container."
+                f" Defaults to `{LEGACY_REPO_DIR}`"
             )
         ),
     ] = None
dstack/_internal/core/models/files.py

@@ -28,7 +28,7 @@ class FilePathMapping(CoreModel):
         Field(
             description=(
                 "The path in the container. Relative paths are resolved relative to"
-                " the repo directory"
+                " the working directory"
             )
         ),
     ]
dstack/_internal/core/models/fleets.py

@@ -19,7 +19,7 @@ from dstack._internal.core.models.profiles import (
     TerminationPolicy,
     parse_idle_duration,
 )
-from dstack._internal.core.models.resources import Range, ResourcesSpec
+from dstack._internal.core.models.resources import ResourcesSpec
 from dstack._internal.utils.common import list_enum_values_for_annotation
 from dstack._internal.utils.json_schema import add_extra_schema_types
 from dstack._internal.utils.tags import tags_validator
@@ -141,6 +141,67 @@ class SSHParams(CoreModel):
         return value


+class FleetNodesSpec(CoreModel):
+    min: Annotated[
+        int, Field(description=("The minimum number of instances to maintain in the fleet"))
+    ]
+    target: Annotated[
+        int,
+        Field(
+            description=(
+                "The number of instances to provision on fleet apply. `min` <= `target` <= `max`."
+                " Defaults to `min`"
+            )
+        ),
+    ]
+    max: Annotated[
+        Optional[int],
+        Field(
+            description=(
+                "The maximum number of instances allowed in the fleet. Unlimited if not specified"
+            )
+        ),
+    ] = None
+
+    def dict(self, *args, **kwargs) -> Dict:
+        # super() does not work with pydantic-duality
+        res = CoreModel.dict(self, *args, **kwargs)
+        # For backward compatibility with old clients
+        # that do not ignore extra fields due to https://github.com/dstackai/dstack/issues/3066
+        if "target" in res and res["target"] == res["min"]:
+            del res["target"]
+        return res
+
+    @root_validator(pre=True)
+    def set_min_and_target_defaults(cls, values):
+        min_ = values.get("min")
+        target = values.get("target")
+        if min_ is None:
+            values["min"] = 0
+        if target is None:
+            values["target"] = values["min"]
+        return values
+
+    @validator("min")
+    def validate_min(cls, v: int) -> int:
+        if v < 0:
+            raise ValueError("min cannot be negative")
+        return v
+
+    @root_validator(skip_on_failure=True)
+    def _post_validate_ranges(cls, values):
+        min_ = values["min"]
+        target = values["target"]
+        max_ = values.get("max")
+        if target < min_:
+            raise ValueError("target must not be less than min")
+        if max_ is not None and max_ < min_:
+            raise ValueError("max must not be less than min")
+        if max_ is not None and max_ < target:
+            raise ValueError("max must not be less than target")
+        return values
+
+
 class InstanceGroupParams(CoreModel):
     env: Annotated[
         Env,
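
Note: under `FleetNodesSpec`, `min` defaults to 0, `target` defaults to `min`, and `max` is unbounded when omitted. A minimal sketch of these semantics as plain functions (assumed behavior, not the pydantic model itself; the parameter names mirror the model's fields, hence the shadowed builtins):

    from typing import Optional

    def resolve_nodes(
        min: Optional[int] = None, target: Optional[int] = None, max: Optional[int] = None
    ) -> tuple:
        min = 0 if min is None else min              # set_min_and_target_defaults
        target = min if target is None else target
        if min < 0:                                  # validate_min
            raise ValueError("min cannot be negative")
        if target < min or (max is not None and max < target):  # _post_validate_ranges
            raise ValueError("min <= target <= max must hold")
        return (min, target, max)

    assert resolve_nodes() == (0, 0, None)             # empty fleet allowed
    assert resolve_nodes(min=2) == (2, 2, None)        # target defaults to min
    assert resolve_nodes(min=1, target=2, max=4) == (1, 2, 4)
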
@@ -151,7 +212,9 @@ class InstanceGroupParams(CoreModel):
         Field(description="The parameters for adding instances via SSH"),
     ] = None

-    nodes: Annotated[Optional[Range[int]], Field(description="The number of instances")] = None
+    nodes: Annotated[
+        Optional[FleetNodesSpec], Field(description="The number of instances in the cloud fleet")
+    ] = None
     placement: Annotated[
         Optional[InstanceGroupPlacement],
         Field(description="The placement of instances: `any` or `cluster`"),
@@ -248,6 +311,16 @@ class InstanceGroupParams(CoreModel):
             extra_types=[{"type": "string"}],
         )

+    @validator("nodes", pre=True)
+    def parse_nodes(cls, v: Optional[Union[dict, str]]) -> Optional[dict]:
+        if isinstance(v, str) and ".." in v:
+            v = v.replace(" ", "")
+            min, max = v.split("..")
+            return dict(min=min or None, max=max or None)
+        elif isinstance(v, str) or isinstance(v, int):
+            return dict(min=v, max=v)
+        return v
+
     _validate_idle_duration = validator("idle_duration", pre=True, allow_reuse=True)(
         parse_idle_duration
     )
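
Note: together with `FleetNodesSpec`, this pre-validator lets `nodes` be written as a plain count or as a `min..max` range string. A standalone sketch of the parsing (reimplemented here for illustration, not imported from dstack; pydantic coerces the string values to ints afterwards):

    from typing import Optional, Union

    def parse_nodes(v: Union[dict, str, int, None]) -> Optional[dict]:
        # Same logic as the validator above.
        if isinstance(v, str) and ".." in v:
            v = v.replace(" ", "")
            lo, hi = v.split("..")
            return dict(min=lo or None, max=hi or None)
        elif isinstance(v, (str, int)):
            return dict(min=v, max=v)
        return v

    assert parse_nodes("1..4") == {"min": "1", "max": "4"}
    assert parse_nodes("2..") == {"min": "2", "max": None}   # open-ended max
    assert parse_nodes(3) == {"min": 3, "max": 3}            # fixed-size fleet
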
dstack/_internal/core/models/runs.py

@@ -10,7 +10,7 @@ from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import ApplyAction, CoreModel, NetworkMode, RegistryAuth
 from dstack._internal.core.models.configurations import (
     DEFAULT_PROBE_METHOD,
-    DEFAULT_REPO_DIR,
+    LEGACY_REPO_DIR,
     AnyRunConfiguration,
     HTTPHeaderSpec,
     HTTPMethod,
@@ -259,6 +259,7 @@ class JobSpec(CoreModel):
     retry: Optional[Retry]
     volumes: Optional[List[MountPoint]] = None
     ssh_key: Optional[JobSSHKey] = None
+    # `working_dir` is always absolute (if not None) since 0.19.27
     working_dir: Optional[str]
     # `repo_data` is optional for client compatibility with pre-0.19.17 servers and for compatibility
     # with jobs submitted before 0.19.17. All new jobs are expected to have non-None `repo_data`.
@@ -268,6 +269,8 @@ class JobSpec(CoreModel):
     # submitted before 0.19.17. See `_get_repo_code_hash` on how to get the correct `repo_code_hash`
     # TODO: drop this comment when supporting jobs submitted before 0.19.17 is no longer relevant.
     repo_code_hash: Optional[str] = None
+    # `repo_dir` was added in 0.19.27. Default value is set for backward compatibility
+    repo_dir: str = LEGACY_REPO_DIR
     file_archives: list[FileArchiveMapping] = []
     # None for non-services and pre-0.19.19 services. See `get_service_port`
     service_port: Optional[int] = None
@@ -409,17 +412,27 @@ class RunSpec(CoreModel):
         Optional[str],
         Field(description="The hash of the repo diff. Can be omitted if there is no repo diff."),
     ] = None
+    repo_dir: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "The repo path inside the container. Relative paths are resolved"
+                f" relative to the working directory. Defaults to `{LEGACY_REPO_DIR}`."
+            )
+        ),
+    ] = None
     file_archives: Annotated[
         list[FileArchiveMapping],
-        Field(description="The list of file archive ID to container path mappings"),
+        Field(description="The list of file archive ID to container path mappings."),
     ] = []
+    # Server uses configuration.working_dir instead of this field since 0.19.27, but
+    # the field still exists for compatibility with older servers
     working_dir: Annotated[
         Optional[str],
         Field(
             description=(
-                "The path to the working directory inside the container."
-                f" It's specified relative to the repository directory (`{DEFAULT_REPO_DIR}`) and should be inside it."
-                ' Defaults to `"."`.'
+                "The absolute path to the working directory inside the container."
+                " Defaults to the default working directory from the `image`."
             )
         ),
     ] = None
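
Note: taken together, the new `repo_dir` and `working_dir` descriptions imply a simple resolution rule: `working_dir` is absolute, and a relative `repo_dir` is resolved against it. A hypothetical sketch of that rule (the helper below is illustrative only, not dstack code):

    from pathlib import PurePosixPath
    from typing import Optional

    LEGACY_REPO_DIR = "/workflow"

    def resolve_repo_dir(repo_dir: Optional[str], working_dir: str) -> str:
        # repo_dir defaults to /workflow; relative values are resolved
        # against the (absolute) working directory.
        if repo_dir is None:
            return LEGACY_REPO_DIR
        path = PurePosixPath(repo_dir)
        if path.is_absolute():
            return str(path)
        return str(PurePosixPath(working_dir) / path)

    assert resolve_repo_dir(None, "/app") == "/workflow"
    assert resolve_repo_dir("src", "/app") == "/app/src"
    assert resolve_repo_dir("/repo", "/app") == "/repo"
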
@@ -506,10 +519,16 @@ class RunStatus(str, Enum):
         return self in self.finished_statuses()


+class RunFleet(CoreModel):
+    id: UUID4
+    name: str
+
+
 class Run(CoreModel):
     id: UUID4
     project_name: str
     user: str
+    fleet: Optional[RunFleet] = None
     submitted_at: datetime
     last_processed_at: datetime
     status: RunStatus
dstack/_internal/core/services/repos.py

@@ -1,9 +1,10 @@
 import os
+from contextlib import suppress
 from pathlib import Path
+from tempfile import NamedTemporaryFile
 from typing import Optional, Union

 import git.cmd
-import requests
 import yaml
 from git.exc import GitCommandError

@@ -13,135 +14,139 @@ from dstack._internal.core.models.repos import LocalRepo, RemoteRepo, RemoteRepoCreds
 from dstack._internal.core.models.repos.remote import GitRepoURL
 from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.path import PathLike
-from dstack._internal.utils.ssh import (
-    get_host_config,
-    make_ssh_command_for_git,
-    try_ssh_key_passphrase,
-)
+from dstack._internal.utils.ssh import get_host_config, make_git_env, try_ssh_key_passphrase

 logger = get_logger(__name__)

 gh_config_path = os.path.expanduser("~/.config/gh/hosts.yml")
 default_ssh_key = os.path.expanduser("~/.ssh/id_rsa")

-no_prompt_env = dict(GIT_TERMINAL_PROMPT="0")
-

 class InvalidRepoCredentialsError(DstackError):
     pass


-def get_local_repo_credentials(
+def get_repo_creds_and_default_branch(
     repo_url: str,
     identity_file: Optional[PathLike] = None,
+    private_key: Optional[str] = None,
     oauth_token: Optional[str] = None,
-) -> RemoteRepoCreds:
+) -> tuple[RemoteRepoCreds, Optional[str]]:
     url = GitRepoURL.parse(repo_url, get_ssh_config=get_host_config)

     # no auth
-    r = requests.get(f"{url.as_https()}/info/refs?service=git-upload-pack", timeout=10)
-    if r.status_code == 200:
-        return RemoteRepoCreds(
-            clone_url=url.as_https(),
-            private_key=None,
-            oauth_token=None,
-        )
-
-    # user-provided ssh key
-    if identity_file is not None:
-        identity_file = os.path.expanduser(identity_file)
-        return check_remote_repo_credentials_ssh(url, identity_file)
-
-    # user-provided oauth token
+    with suppress(InvalidRepoCredentialsError):
+        return _get_repo_creds_and_default_branch_https(url)
+
+    # ssh key provided by the user or pulled from the server
+    if identity_file is not None or private_key is not None:
+        if identity_file is not None:
+            private_key = _read_private_key(identity_file)
+            return _get_repo_creds_and_default_branch_ssh(url, identity_file, private_key)
+        elif private_key is not None:
+            with NamedTemporaryFile("w+", 0o600) as f:
+                f.write(private_key)
+                f.flush()
+                return _get_repo_creds_and_default_branch_ssh(url, f.name, private_key)
+        else:
+            assert False, "should not reach here"
+
+    # oauth token provided by the user or pulled from the server
     if oauth_token is not None:
-        return check_remote_repo_credentials_https(url, oauth_token)
+        return _get_repo_creds_and_default_branch_https(url, oauth_token)

     # key from ssh config
     identities = get_host_config(url.original_host).get("identityfile")
     if identities:
-        return check_remote_repo_credentials_ssh(url, identities[0])
+        _identity_file = identities[0]
+        with suppress(InvalidRepoCredentialsError):
+            _private_key = _read_private_key(_identity_file)
+            return _get_repo_creds_and_default_branch_ssh(url, _identity_file, _private_key)

     # token from gh config
     if os.path.exists(gh_config_path):
         with open(gh_config_path, "r") as f:
             gh_hosts = yaml.load(f, Loader=yaml.FullLoader)
-        oauth_token = gh_hosts.get(url.host, {}).get("oauth_token")
-        if oauth_token is not None:
-            try:
-                return check_remote_repo_credentials_https(url, oauth_token)
-            except InvalidRepoCredentialsError:
-                pass
+        _oauth_token = gh_hosts.get(url.host, {}).get("oauth_token")
+        if _oauth_token is not None:
+            with suppress(InvalidRepoCredentialsError):
+                return _get_repo_creds_and_default_branch_https(url, _oauth_token)

     # default user key
     if os.path.exists(default_ssh_key):
-        try:
-            return check_remote_repo_credentials_ssh(url, default_ssh_key)
-        except InvalidRepoCredentialsError:
-            pass
+        with suppress(InvalidRepoCredentialsError):
+            _private_key = _read_private_key(default_ssh_key)
+            return _get_repo_creds_and_default_branch_ssh(url, default_ssh_key, _private_key)

     raise InvalidRepoCredentialsError(
         "No valid default Git credentials found. Pass valid `--token` or `--git-identity`."
     )


-def check_remote_repo_credentials_https(url: GitRepoURL, oauth_token: str) -> RemoteRepoCreds:
-    try:
-        git.cmd.Git().ls_remote(url.as_https(oauth_token), env=no_prompt_env)
-    except GitCommandError:
-        masked = len(oauth_token[:-4]) * "*" + oauth_token[-4:]
-        raise InvalidRepoCredentialsError(
-            f"Can't access `{url.as_https()}` using the `{masked}` token"
-        )
-    return RemoteRepoCreds(
-        clone_url=url.as_https(),
-        oauth_token=oauth_token,
-        private_key=None,
-    )
+def _get_repo_creds_and_default_branch_ssh(
+    url: GitRepoURL, identity_file: PathLike, private_key: str
+) -> tuple[RemoteRepoCreds, Optional[str]]:
+    _url = url.as_ssh()
+    try:
+        default_branch = _get_repo_default_branch(_url, make_git_env(identity_file=identity_file))
+    except GitCommandError as e:
+        message = f"Cannot access `{_url}` using the `{identity_file}` private SSH key"
+        raise InvalidRepoCredentialsError(message) from e
+    creds = RemoteRepoCreds(
+        clone_url=_url,
+        private_key=private_key,
+        oauth_token=None,
+    )
+    return creds, default_branch
+
+
+def _get_repo_creds_and_default_branch_https(
+    url: GitRepoURL, oauth_token: Optional[str] = None
+) -> tuple[RemoteRepoCreds, Optional[str]]:
+    _url = url.as_https()
+    try:
+        default_branch = _get_repo_default_branch(url.as_https(oauth_token), make_git_env())
+    except GitCommandError as e:
+        message = f"Cannot access `{_url}`"
+        if oauth_token is not None:
+            masked_token = len(oauth_token[:-4]) * "*" + oauth_token[-4:]
+            message = f"{message} using the `{masked_token}` token"
+        raise InvalidRepoCredentialsError(message) from e
+    creds = RemoteRepoCreds(
+        clone_url=_url,
+        private_key=None,
+        oauth_token=oauth_token,
+    )
+    return creds, default_branch


-def check_remote_repo_credentials_ssh(url: GitRepoURL, identity_file: PathLike) -> RemoteRepoCreds:
+def _get_repo_default_branch(url: str, env: dict[str, str]) -> Optional[str]:
+    # output example: "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
+    output: str = git.cmd.Git().ls_remote("--symref", url, "HEAD", env=env)
+    for line in output.splitlines():
+        # line format: `<oid> TAB <ref> LF`
+        oid, _, ref = line.partition("\t")
+        if oid.startswith("ref:") and ref == "HEAD":
+            return oid.rsplit("/", maxsplit=1)[-1]
+    return None
+
+
+def _read_private_key(identity_file: PathLike) -> str:
+    identity_file = Path(identity_file).expanduser().resolve()
     if not Path(identity_file).exists():
-        raise InvalidRepoCredentialsError(f"The {identity_file} private SSH key doesn't exist")
+        raise InvalidRepoCredentialsError(f"The `{identity_file}` private SSH key doesn't exist")
     if not os.access(identity_file, os.R_OK):
-        raise InvalidRepoCredentialsError(f"Can't access the {identity_file} private SSH key")
+        raise InvalidRepoCredentialsError(f"Cannot access the `{identity_file}` private SSH key")
     if not try_ssh_key_passphrase(identity_file):
         raise InvalidRepoCredentialsError(
             f"Cannot use the `{identity_file}` private SSH key. "
             "Ensure that it is valid and passphrase-free"
         )
-    with open(identity_file, "r") as f:
-        private_key = f.read()
-
-    try:
-        git.cmd.Git().ls_remote(
-            url.as_ssh(), env=dict(GIT_SSH_COMMAND=make_ssh_command_for_git(identity_file))
-        )
-    except GitCommandError:
-        raise InvalidRepoCredentialsError(
-            f"Can't access `{url.as_ssh()}` using the `{identity_file}` private SSH key"
-        )
-
-    return RemoteRepoCreds(
-        clone_url=url.as_ssh(),
-        private_key=private_key,
-        oauth_token=None,
-    )
-
-
-def get_default_branch(remote_url: str) -> Optional[str]:
-    """
-    Get the default branch of a remote Git repository.
-    """
-    try:
-        output = git.cmd.Git().ls_remote("--symref", remote_url, "HEAD", env=no_prompt_env)
-        for line in output.splitlines():
-            if line.startswith("ref:"):
-                return line.split()[1].split("/")[-1]
-    except Exception as e:
-        logger.debug("Failed to get remote repo default branch: %s", repr(e))
-    return None
+    with open(identity_file, "r") as file:
+        return file.read()


+# Used for `config.yml` only, remove it with `repos` in `config.yml`
 def load_repo(config: RepoConfig) -> Union[RemoteRepo, LocalRepo]:
     if config.repo_type == "remote":
         return RemoteRepo(repo_id=config.repo_id, local_repo_dir=config.path)
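
Note: the default-branch lookup now parses `git ls-remote --symref <url> HEAD` output instead of scanning for any `ref:` prefix. A standalone sketch of the parsing, run against the sample output quoted in the code comment (reimplemented for illustration, not imported from dstack):

    from typing import Optional

    sample = "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"

    def default_branch(output: str) -> Optional[str]:
        for line in output.splitlines():
            # Each line is `<oid-or-symref> TAB <ref>`.
            left, _, ref = line.partition("\t")
            if left.startswith("ref:") and ref == "HEAD":
                return left.rsplit("/", maxsplit=1)[-1]  # "ref: refs/heads/dev" -> "dev"
        return None

    assert default_branch(sample) == "dev"
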
dstack/_internal/server/background/tasks/process_fleets.py

@@ -1,11 +1,13 @@
 from datetime import timedelta
 from typing import List
+from uuid import UUID

 from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload, load_only

-from dstack._internal.core.models.fleets import FleetStatus
+from dstack._internal.core.models.fleets import FleetSpec, FleetStatus
+from dstack._internal.core.models.instances import InstanceStatus
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -15,7 +17,9 @@ from dstack._internal.server.models import (
     RunModel,
 )
 from dstack._internal.server.services.fleets import (
+    create_fleet_instance_model,
     get_fleet_spec,
+    get_next_instance_num,
     is_fleet_empty,
     is_fleet_in_use,
 )
@@ -65,31 +69,111 @@ async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel])
     res = await session.execute(
         select(FleetModel)
         .where(FleetModel.id.in_(fleet_ids))
-        .options(joinedload(FleetModel.instances).load_only(InstanceModel.deleted))
         .options(
-            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id)
+            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id),
+            joinedload(FleetModel.project),
         )
         .options(joinedload(FleetModel.runs).load_only(RunModel.status))
         .execution_options(populate_existing=True)
     )
     fleet_models = list(res.unique().scalars().all())

+    # TODO: Drop fleets auto-deletion after dropping fleets auto-creation.
     deleted_fleets_ids = []
-    now = get_current_datetime()
     for fleet_model in fleet_models:
+        _consolidate_fleet_state_with_spec(session, fleet_model)
         deleted = _autodelete_fleet(fleet_model)
         if deleted:
             deleted_fleets_ids.append(fleet_model.id)
-        fleet_model.last_processed_at = now
-
-    await session.execute(
-        update(PlacementGroupModel)
-        .where(
-            PlacementGroupModel.fleet_id.in_(deleted_fleets_ids),
-        )
-        .values(fleet_deleted=True)
-    )
-    await session.commit()
+        fleet_model.last_processed_at = get_current_datetime()
+    await _update_deleted_fleets_placement_groups(session, deleted_fleets_ids)
+    await session.commit()
+
+
+def _consolidate_fleet_state_with_spec(session: AsyncSession, fleet_model: FleetModel):
+    if fleet_model.status == FleetStatus.TERMINATING:
+        return
+    fleet_spec = get_fleet_spec(fleet_model)
+    if fleet_spec.configuration.nodes is None or fleet_spec.autocreated:
+        # Only explicitly created cloud fleets are consolidated.
+        return
+    if not _is_fleet_ready_for_consolidation(fleet_model):
+        return
+    added_instances = _maintain_fleet_nodes_min(session, fleet_model, fleet_spec)
+    if added_instances:
+        fleet_model.consolidation_attempt += 1
+    else:
+        # The fleet is already consolidated or consolidation is in progress.
+        # We reset consolidation_attempt in both cases for simplicity.
+        # The second case does not need the reset, but it's ok to do since
+        # it means consolidation takes longer than the delay, so it won't happen too often.
+        # TODO: Reset consolidation_attempt on fleet in-place update.
+        fleet_model.consolidation_attempt = 0
+    fleet_model.last_consolidated_at = get_current_datetime()
+
+
+def _is_fleet_ready_for_consolidation(fleet_model: FleetModel) -> bool:
+    consolidation_retry_delay = _get_consolidation_retry_delay(fleet_model.consolidation_attempt)
+    last_consolidated_at = fleet_model.last_consolidated_at or fleet_model.last_processed_at
+    duration_since_last_consolidation = get_current_datetime() - last_consolidated_at
+    return duration_since_last_consolidation >= consolidation_retry_delay
+
+
+# We use exponentially increasing consolidation retry delays so that
+# consolidation does not happen too often. In particular, this prevents
+# retrying instance provisioning constantly in case of no offers.
+# TODO: Adjust delays.
+_CONSOLIDATION_RETRY_DELAYS = [
+    timedelta(seconds=30),
+    timedelta(minutes=1),
+    timedelta(minutes=2),
+    timedelta(minutes=5),
+    timedelta(minutes=10),
+]
+
+
+def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta:
+    if consolidation_attempt < len(_CONSOLIDATION_RETRY_DELAYS):
+        return _CONSOLIDATION_RETRY_DELAYS[consolidation_attempt]
+    return _CONSOLIDATION_RETRY_DELAYS[-1]
+
+
+def _maintain_fleet_nodes_min(
+    session: AsyncSession,
+    fleet_model: FleetModel,
+    fleet_spec: FleetSpec,
+) -> bool:
+    """
+    Ensures the fleet has at least `nodes.min` instances.
+    Returns `True` if it retried or added new instances and `False` otherwise.
+    """
+    assert fleet_spec.configuration.nodes is not None
+    for instance in fleet_model.instances:
+        # Delete terminated but not deleted instances since
+        # they are going to be replaced with new pending instances.
+        if instance.status == InstanceStatus.TERMINATED and not instance.deleted:
+            # It's safe to modify instances without the instance lock since
+            # no other task modifies already terminated instances.
+            instance.deleted = True
+            instance.deleted_at = get_current_datetime()
+    active_instances = [i for i in fleet_model.instances if not i.deleted]
+    active_instances_num = len(active_instances)
+    if active_instances_num >= fleet_spec.configuration.nodes.min:
+        return False
+    nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num
+    for i in range(nodes_missing):
+        instance_model = create_fleet_instance_model(
+            session=session,
+            project=fleet_model.project,
+            # TODO: Store fleet.user and pass it instead of the project owner.
+            username=fleet_model.project.owner.name,
+            spec=fleet_spec,
+            instance_num=get_next_instance_num({i.instance_num for i in active_instances}),
+        )
+        active_instances.append(instance_model)
+        fleet_model.instances.append(instance_model)
+    logger.info("Added %s instances to fleet %s", nodes_missing, fleet_model.name)
+    return True


 def _autodelete_fleet(fleet_model: FleetModel) -> bool:
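
Note: the retry schedule is capped at its last entry, so repeatedly failing consolidations settle into a steady ten-minute cadence. A quick sketch of the lookup (standalone reimplementation of the two helpers above):

    from datetime import timedelta

    # Mirrors _CONSOLIDATION_RETRY_DELAYS / _get_consolidation_retry_delay.
    DELAYS = [
        timedelta(seconds=30),
        timedelta(minutes=1),
        timedelta(minutes=2),
        timedelta(minutes=5),
        timedelta(minutes=10),
    ]

    def retry_delay(attempt: int) -> timedelta:
        # Attempts past the end of the table reuse the final, largest delay.
        return DELAYS[attempt] if attempt < len(DELAYS) else DELAYS[-1]

    assert retry_delay(0) == timedelta(seconds=30)
    assert retry_delay(7) == timedelta(minutes=10)  # capped
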
@@ -100,7 +184,7 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     if (
         fleet_model.status != FleetStatus.TERMINATING
         and fleet_spec.configuration.nodes is not None
-        and (fleet_spec.configuration.nodes.min is None or fleet_spec.configuration.nodes.min == 0)
+        and fleet_spec.configuration.nodes.min == 0
     ):
         # Empty fleets that allow 0 nodes should not be auto-deleted
         return False
@@ -110,3 +194,15 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     fleet_model.deleted = True
     logger.info("Fleet %s deleted", fleet_model.name)
     return True
+
+
+async def _update_deleted_fleets_placement_groups(session: AsyncSession, fleets_ids: list[UUID]):
+    if len(fleets_ids) == 0:
+        return
+    await session.execute(
+        update(PlacementGroupModel)
+        .where(
+            PlacementGroupModel.fleet_id.in_(fleets_ids),
+        )
+        .values(fleet_deleted=True)
+    )