dstack-0.19.26-py3-none-any.whl → dstack-0.19.27-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. More details are linked from the original release page.
- dstack/_internal/cli/commands/init.py +2 -2
- dstack/_internal/cli/services/configurators/run.py +114 -16
- dstack/_internal/cli/services/repos.py +1 -18
- dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
- dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
- dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
- dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
- dstack/_internal/core/backends/aws/compute.py +6 -1
- dstack/_internal/core/backends/base/compute.py +33 -5
- dstack/_internal/core/backends/base/offers.py +2 -0
- dstack/_internal/core/backends/configurators.py +15 -0
- dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean/backend.py +16 -0
- dstack/_internal/core/backends/digitalocean/compute.py +5 -0
- dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
- dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
- dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
- dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
- dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
- dstack/_internal/core/backends/gcp/compute.py +32 -8
- dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
- dstack/_internal/core/backends/hotaisle/compute.py +1 -6
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/backends/nebius/compute.py +0 -7
- dstack/_internal/core/backends/oci/compute.py +4 -5
- dstack/_internal/core/backends/vultr/compute.py +1 -5
- dstack/_internal/core/compatibility/fleets.py +5 -0
- dstack/_internal/core/compatibility/runs.py +8 -1
- dstack/_internal/core/models/backends/base.py +5 -1
- dstack/_internal/core/models/configurations.py +21 -7
- dstack/_internal/core/models/files.py +1 -1
- dstack/_internal/core/models/fleets.py +75 -2
- dstack/_internal/core/models/runs.py +24 -5
- dstack/_internal/core/services/repos.py +85 -80
- dstack/_internal/server/background/tasks/process_fleets.py +109 -13
- dstack/_internal/server/background/tasks/process_instances.py +12 -71
- dstack/_internal/server/background/tasks/process_running_jobs.py +2 -0
- dstack/_internal/server/background/tasks/process_runs.py +2 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +18 -6
- dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
- dstack/_internal/server/models.py +5 -2
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/fleets.py +23 -25
- dstack/_internal/server/services/instances.py +3 -3
- dstack/_internal/server/services/jobs/configurators/base.py +46 -6
- dstack/_internal/server/services/jobs/configurators/dev.py +4 -4
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -5
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +4 -6
- dstack/_internal/server/services/jobs/configurators/service.py +0 -3
- dstack/_internal/server/services/jobs/configurators/task.py +0 -3
- dstack/_internal/server/services/runs.py +16 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-4eecc75fbe64067eb1bc.js} +1146 -899
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-4eecc75fbe64067eb1bc.js.map} +1 -1
- dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-56191c63d516fd0041c4.css} +1 -1
- dstack/_internal/server/testing/common.py +6 -3
- dstack/_internal/utils/path.py +8 -1
- dstack/_internal/utils/ssh.py +7 -0
- dstack/api/_public/repos.py +41 -6
- dstack/api/_public/runs.py +14 -1
- dstack/version.py +1 -1
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/METADATA +2 -2
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/RECORD +68 -53
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/WHEEL +0 -0
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -34,7 +34,7 @@ STRIP_PREFIX_DEFAULT = True
|
|
|
34
34
|
RUN_PRIOTIRY_MIN = 0
|
|
35
35
|
RUN_PRIOTIRY_MAX = 100
|
|
36
36
|
RUN_PRIORITY_DEFAULT = 0
|
|
37
|
-
|
|
37
|
+
LEGACY_REPO_DIR = "/workflow"
|
|
38
38
|
MIN_PROBE_TIMEOUT = 1
|
|
39
39
|
MIN_PROBE_INTERVAL = 1
|
|
40
40
|
DEFAULT_PROBE_URL = "/"
|
|
@@ -112,8 +112,15 @@ class RepoSpec(CoreModel):
|
|
|
112
112
|
Optional[str],
|
|
113
113
|
Field(description="The commit hash"),
|
|
114
114
|
] = None
|
|
115
|
-
|
|
116
|
-
|
|
115
|
+
path: Annotated[
|
|
116
|
+
Optional[str],
|
|
117
|
+
Field(
|
|
118
|
+
description=(
|
|
119
|
+
"The repo path inside the run container. Relative paths are resolved"
|
|
120
|
+
f" relative to the working directory. Defaults to `{LEGACY_REPO_DIR}`"
|
|
121
|
+
)
|
|
122
|
+
),
|
|
123
|
+
] = None
|
|
117
124
|
|
|
118
125
|
@classmethod
|
|
119
126
|
def parse(cls, v: str) -> Self:
|
|
@@ -149,6 +156,14 @@ class RepoSpec(CoreModel):
|
|
|
149
156
|
raise ValueError("Either `local_path` or `url` must be specified")
|
|
150
157
|
return values
|
|
151
158
|
|
|
159
|
+
@validator("path")
|
|
160
|
+
def validate_path(cls, v: Optional[str]) -> Optional[str]:
|
|
161
|
+
if v is None:
|
|
162
|
+
return v
|
|
163
|
+
if v.startswith("~") and PurePosixPath(v).parts[0] != "~":
|
|
164
|
+
raise ValueError("`~username` syntax is not supported")
|
|
165
|
+
return v
|
|
166
|
+
|
|
152
167
|
|
|
153
168
|
class ScalingSpec(CoreModel):
|
|
154
169
|
metric: Annotated[
|
|
@@ -380,7 +395,7 @@ class BaseRunConfiguration(CoreModel):
|
|
|
380
395
|
Field(
|
|
381
396
|
description=(
|
|
382
397
|
"The user inside the container, `user_name_or_id[:group_name_or_id]`"
|
|
383
|
-
" (e.g., `ubuntu`, `1000:1000`). Defaults to the default `image`
|
|
398
|
+
" (e.g., `ubuntu`, `1000:1000`). Defaults to the default user from the `image`"
|
|
384
399
|
)
|
|
385
400
|
),
|
|
386
401
|
] = None
|
|
@@ -390,9 +405,8 @@ class BaseRunConfiguration(CoreModel):
|
|
|
390
405
|
Optional[str],
|
|
391
406
|
Field(
|
|
392
407
|
description=(
|
|
393
|
-
"The path to the working directory inside the container."
|
|
394
|
-
f"
|
|
395
|
-
' Defaults to `"."` '
|
|
408
|
+
"The absolute path to the working directory inside the container."
|
|
409
|
+
f" Defaults to `{LEGACY_REPO_DIR}`"
|
|
396
410
|
)
|
|
397
411
|
),
|
|
398
412
|
] = None
|
|
@@ -19,7 +19,7 @@ from dstack._internal.core.models.profiles import (
|
|
|
19
19
|
TerminationPolicy,
|
|
20
20
|
parse_idle_duration,
|
|
21
21
|
)
|
|
22
|
-
from dstack._internal.core.models.resources import
|
|
22
|
+
from dstack._internal.core.models.resources import ResourcesSpec
|
|
23
23
|
from dstack._internal.utils.common import list_enum_values_for_annotation
|
|
24
24
|
from dstack._internal.utils.json_schema import add_extra_schema_types
|
|
25
25
|
from dstack._internal.utils.tags import tags_validator
|
|
@@ -141,6 +141,67 @@ class SSHParams(CoreModel):
|
|
|
141
141
|
return value
|
|
142
142
|
|
|
143
143
|
|
|
144
|
+
class FleetNodesSpec(CoreModel):
|
|
145
|
+
min: Annotated[
|
|
146
|
+
int, Field(description=("The minimum number of instances to maintain in the fleet"))
|
|
147
|
+
]
|
|
148
|
+
target: Annotated[
|
|
149
|
+
int,
|
|
150
|
+
Field(
|
|
151
|
+
description=(
|
|
152
|
+
"The number of instances to provision on fleet apply. `min` <= `target` <= `max`"
|
|
153
|
+
" Defaults to `min`"
|
|
154
|
+
)
|
|
155
|
+
),
|
|
156
|
+
]
|
|
157
|
+
max: Annotated[
|
|
158
|
+
Optional[int],
|
|
159
|
+
Field(
|
|
160
|
+
description=(
|
|
161
|
+
"The maximum number of instances allowed in the fleet. Unlimited if not specified"
|
|
162
|
+
)
|
|
163
|
+
),
|
|
164
|
+
] = None
|
|
165
|
+
|
|
166
|
+
def dict(self, *args, **kwargs) -> Dict:
|
|
167
|
+
# super() does not work with pydantic-duality
|
|
168
|
+
res = CoreModel.dict(self, *args, **kwargs)
|
|
169
|
+
# For backward compatibility with old clients
|
|
170
|
+
# that do not ignore extra fields due to https://github.com/dstackai/dstack/issues/3066
|
|
171
|
+
if "target" in res and res["target"] == res["min"]:
|
|
172
|
+
del res["target"]
|
|
173
|
+
return res
|
|
174
|
+
|
|
175
|
+
@root_validator(pre=True)
|
|
176
|
+
def set_min_and_target_defaults(cls, values):
|
|
177
|
+
min_ = values.get("min")
|
|
178
|
+
target = values.get("target")
|
|
179
|
+
if min_ is None:
|
|
180
|
+
values["min"] = 0
|
|
181
|
+
if target is None:
|
|
182
|
+
values["target"] = values["min"]
|
|
183
|
+
return values
|
|
184
|
+
|
|
185
|
+
@validator("min")
|
|
186
|
+
def validate_min(cls, v: int) -> int:
|
|
187
|
+
if v < 0:
|
|
188
|
+
raise ValueError("min cannot be negative")
|
|
189
|
+
return v
|
|
190
|
+
|
|
191
|
+
@root_validator(skip_on_failure=True)
|
|
192
|
+
def _post_validate_ranges(cls, values):
|
|
193
|
+
min_ = values["min"]
|
|
194
|
+
target = values["target"]
|
|
195
|
+
max_ = values.get("max")
|
|
196
|
+
if target < min_:
|
|
197
|
+
raise ValueError("target must not be be less than min")
|
|
198
|
+
if max_ is not None and max_ < min_:
|
|
199
|
+
raise ValueError("max must not be less than min")
|
|
200
|
+
if max_ is not None and max_ < target:
|
|
201
|
+
raise ValueError("max must not be less than target")
|
|
202
|
+
return values
|
|
203
|
+
|
|
204
|
+
|
|
144
205
|
class InstanceGroupParams(CoreModel):
|
|
145
206
|
env: Annotated[
|
|
146
207
|
Env,
|
|
@@ -151,7 +212,9 @@ class InstanceGroupParams(CoreModel):
|
|
|
151
212
|
Field(description="The parameters for adding instances via SSH"),
|
|
152
213
|
] = None
|
|
153
214
|
|
|
154
|
-
nodes: Annotated[
|
|
215
|
+
nodes: Annotated[
|
|
216
|
+
Optional[FleetNodesSpec], Field(description="The number of instances in cloud fleet")
|
|
217
|
+
] = None
|
|
155
218
|
placement: Annotated[
|
|
156
219
|
Optional[InstanceGroupPlacement],
|
|
157
220
|
Field(description="The placement of instances: `any` or `cluster`"),
|
|
@@ -248,6 +311,16 @@ class InstanceGroupParams(CoreModel):
|
|
|
248
311
|
extra_types=[{"type": "string"}],
|
|
249
312
|
)
|
|
250
313
|
|
|
314
|
+
@validator("nodes", pre=True)
|
|
315
|
+
def parse_nodes(cls, v: Optional[Union[dict, str]]) -> Optional[dict]:
|
|
316
|
+
if isinstance(v, str) and ".." in v:
|
|
317
|
+
v = v.replace(" ", "")
|
|
318
|
+
min, max = v.split("..")
|
|
319
|
+
return dict(min=min or None, max=max or None)
|
|
320
|
+
elif isinstance(v, str) or isinstance(v, int):
|
|
321
|
+
return dict(min=v, max=v)
|
|
322
|
+
return v
|
|
323
|
+
|
|
251
324
|
_validate_idle_duration = validator("idle_duration", pre=True, allow_reuse=True)(
|
|
252
325
|
parse_idle_duration
|
|
253
326
|
)
|
|
@@ -10,7 +10,7 @@ from dstack._internal.core.models.backends.base import BackendType
|
|
|
10
10
|
from dstack._internal.core.models.common import ApplyAction, CoreModel, NetworkMode, RegistryAuth
|
|
11
11
|
from dstack._internal.core.models.configurations import (
|
|
12
12
|
DEFAULT_PROBE_METHOD,
|
|
13
|
-
|
|
13
|
+
LEGACY_REPO_DIR,
|
|
14
14
|
AnyRunConfiguration,
|
|
15
15
|
HTTPHeaderSpec,
|
|
16
16
|
HTTPMethod,
|
|
@@ -259,6 +259,7 @@ class JobSpec(CoreModel):
|
|
|
259
259
|
retry: Optional[Retry]
|
|
260
260
|
volumes: Optional[List[MountPoint]] = None
|
|
261
261
|
ssh_key: Optional[JobSSHKey] = None
|
|
262
|
+
# `working_dir` is always absolute (if not None) since 0.19.27
|
|
262
263
|
working_dir: Optional[str]
|
|
263
264
|
# `repo_data` is optional for client compatibility with pre-0.19.17 servers and for compatibility
|
|
264
265
|
# with jobs submitted before 0.19.17. All new jobs are expected to have non-None `repo_data`.
|
|
@@ -268,6 +269,8 @@ class JobSpec(CoreModel):
|
|
|
268
269
|
# submitted before 0.19.17. See `_get_repo_code_hash` on how to get the correct `repo_code_hash`
|
|
269
270
|
# TODO: drop this comment when supporting jobs submitted before 0.19.17 is no longer relevant.
|
|
270
271
|
repo_code_hash: Optional[str] = None
|
|
272
|
+
# `repo_dir` was added in 0.19.27. Default value is set for backward compatibility
|
|
273
|
+
repo_dir: str = LEGACY_REPO_DIR
|
|
271
274
|
file_archives: list[FileArchiveMapping] = []
|
|
272
275
|
# None for non-services and pre-0.19.19 services. See `get_service_port`
|
|
273
276
|
service_port: Optional[int] = None
|
|
@@ -409,17 +412,27 @@ class RunSpec(CoreModel):
|
|
|
409
412
|
Optional[str],
|
|
410
413
|
Field(description="The hash of the repo diff. Can be omitted if there is no repo diff."),
|
|
411
414
|
] = None
|
|
415
|
+
repo_dir: Annotated[
|
|
416
|
+
Optional[str],
|
|
417
|
+
Field(
|
|
418
|
+
description=(
|
|
419
|
+
"The repo path inside the container. Relative paths are resolved"
|
|
420
|
+
f" relative to the working directory. Defaults to `{LEGACY_REPO_DIR}`."
|
|
421
|
+
)
|
|
422
|
+
),
|
|
423
|
+
] = None
|
|
412
424
|
file_archives: Annotated[
|
|
413
425
|
list[FileArchiveMapping],
|
|
414
|
-
Field(description="The list of file archive ID to container path mappings"),
|
|
426
|
+
Field(description="The list of file archive ID to container path mappings."),
|
|
415
427
|
] = []
|
|
428
|
+
# Server uses configuration.working_dir instead of this field since 0.19.27, but
|
|
429
|
+
# the field still exists for compatibility with older servers
|
|
416
430
|
working_dir: Annotated[
|
|
417
431
|
Optional[str],
|
|
418
432
|
Field(
|
|
419
433
|
description=(
|
|
420
|
-
"The path to the working directory inside the container."
|
|
421
|
-
|
|
422
|
-
' Defaults to `"."`.'
|
|
434
|
+
"The absolute path to the working directory inside the container."
|
|
435
|
+
" Defaults to the default working directory from the `image`."
|
|
423
436
|
)
|
|
424
437
|
),
|
|
425
438
|
] = None
|
|
@@ -506,10 +519,16 @@ class RunStatus(str, Enum):
|
|
|
506
519
|
return self in self.finished_statuses()
|
|
507
520
|
|
|
508
521
|
|
|
522
|
+
class RunFleet(CoreModel):
|
|
523
|
+
id: UUID4
|
|
524
|
+
name: str
|
|
525
|
+
|
|
526
|
+
|
|
509
527
|
class Run(CoreModel):
|
|
510
528
|
id: UUID4
|
|
511
529
|
project_name: str
|
|
512
530
|
user: str
|
|
531
|
+
fleet: Optional[RunFleet] = None
|
|
513
532
|
submitted_at: datetime
|
|
514
533
|
last_processed_at: datetime
|
|
515
534
|
status: RunStatus
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import os
|
|
2
|
+
from contextlib import suppress
|
|
2
3
|
from pathlib import Path
|
|
4
|
+
from tempfile import NamedTemporaryFile
|
|
3
5
|
from typing import Optional, Union
|
|
4
6
|
|
|
5
7
|
import git.cmd
|
|
6
|
-
import requests
|
|
7
8
|
import yaml
|
|
8
9
|
from git.exc import GitCommandError
|
|
9
10
|
|
|
@@ -13,135 +14,139 @@ from dstack._internal.core.models.repos import LocalRepo, RemoteRepo, RemoteRepo
|
|
|
13
14
|
from dstack._internal.core.models.repos.remote import GitRepoURL
|
|
14
15
|
from dstack._internal.utils.logging import get_logger
|
|
15
16
|
from dstack._internal.utils.path import PathLike
|
|
16
|
-
from dstack._internal.utils.ssh import
|
|
17
|
-
get_host_config,
|
|
18
|
-
make_ssh_command_for_git,
|
|
19
|
-
try_ssh_key_passphrase,
|
|
20
|
-
)
|
|
17
|
+
from dstack._internal.utils.ssh import get_host_config, make_git_env, try_ssh_key_passphrase
|
|
21
18
|
|
|
22
19
|
logger = get_logger(__name__)
|
|
23
20
|
|
|
24
21
|
gh_config_path = os.path.expanduser("~/.config/gh/hosts.yml")
|
|
25
22
|
default_ssh_key = os.path.expanduser("~/.ssh/id_rsa")
|
|
26
23
|
|
|
27
|
-
no_prompt_env = dict(GIT_TERMINAL_PROMPT="0")
|
|
28
|
-
|
|
29
24
|
|
|
30
25
|
class InvalidRepoCredentialsError(DstackError):
|
|
31
26
|
pass
|
|
32
27
|
|
|
33
28
|
|
|
34
|
-
def
|
|
29
|
+
def get_repo_creds_and_default_branch(
|
|
35
30
|
repo_url: str,
|
|
36
31
|
identity_file: Optional[PathLike] = None,
|
|
32
|
+
private_key: Optional[str] = None,
|
|
37
33
|
oauth_token: Optional[str] = None,
|
|
38
|
-
) -> RemoteRepoCreds:
|
|
34
|
+
) -> tuple[RemoteRepoCreds, Optional[str]]:
|
|
39
35
|
url = GitRepoURL.parse(repo_url, get_ssh_config=get_host_config)
|
|
40
36
|
|
|
41
37
|
# no auth
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
38
|
+
with suppress(InvalidRepoCredentialsError):
|
|
39
|
+
return _get_repo_creds_and_default_branch_https(url)
|
|
40
|
+
|
|
41
|
+
# ssh key provided by the user or pulled from the server
|
|
42
|
+
if identity_file is not None or private_key is not None:
|
|
43
|
+
if identity_file is not None:
|
|
44
|
+
private_key = _read_private_key(identity_file)
|
|
45
|
+
return _get_repo_creds_and_default_branch_ssh(url, identity_file, private_key)
|
|
46
|
+
elif private_key is not None:
|
|
47
|
+
with NamedTemporaryFile("w+", 0o600) as f:
|
|
48
|
+
f.write(private_key)
|
|
49
|
+
f.flush()
|
|
50
|
+
return _get_repo_creds_and_default_branch_ssh(url, f.name, private_key)
|
|
51
|
+
else:
|
|
52
|
+
assert False, "should not reach here"
|
|
53
|
+
|
|
54
|
+
# oauth token provided by the user or pulled from the server
|
|
56
55
|
if oauth_token is not None:
|
|
57
|
-
return
|
|
56
|
+
return _get_repo_creds_and_default_branch_https(url, oauth_token)
|
|
58
57
|
|
|
59
58
|
# key from ssh config
|
|
60
59
|
identities = get_host_config(url.original_host).get("identityfile")
|
|
61
60
|
if identities:
|
|
62
|
-
|
|
61
|
+
_identity_file = identities[0]
|
|
62
|
+
with suppress(InvalidRepoCredentialsError):
|
|
63
|
+
_private_key = _read_private_key(_identity_file)
|
|
64
|
+
return _get_repo_creds_and_default_branch_ssh(url, _identity_file, _private_key)
|
|
63
65
|
|
|
64
66
|
# token from gh config
|
|
65
67
|
if os.path.exists(gh_config_path):
|
|
66
68
|
with open(gh_config_path, "r") as f:
|
|
67
69
|
gh_hosts = yaml.load(f, Loader=yaml.FullLoader)
|
|
68
|
-
|
|
69
|
-
if
|
|
70
|
-
|
|
71
|
-
return
|
|
72
|
-
except InvalidRepoCredentialsError:
|
|
73
|
-
pass
|
|
70
|
+
_oauth_token = gh_hosts.get(url.host, {}).get("oauth_token")
|
|
71
|
+
if _oauth_token is not None:
|
|
72
|
+
with suppress(InvalidRepoCredentialsError):
|
|
73
|
+
return _get_repo_creds_and_default_branch_https(url, _oauth_token)
|
|
74
74
|
|
|
75
75
|
# default user key
|
|
76
76
|
if os.path.exists(default_ssh_key):
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
pass
|
|
77
|
+
with suppress(InvalidRepoCredentialsError):
|
|
78
|
+
_private_key = _read_private_key(default_ssh_key)
|
|
79
|
+
return _get_repo_creds_and_default_branch_ssh(url, default_ssh_key, _private_key)
|
|
81
80
|
|
|
82
81
|
raise InvalidRepoCredentialsError(
|
|
83
82
|
"No valid default Git credentials found. Pass valid `--token` or `--git-identity`."
|
|
84
83
|
)
|
|
85
84
|
|
|
86
85
|
|
|
87
|
-
def
|
|
86
|
+
def _get_repo_creds_and_default_branch_ssh(
|
|
87
|
+
url: GitRepoURL, identity_file: PathLike, private_key: str
|
|
88
|
+
) -> tuple[RemoteRepoCreds, Optional[str]]:
|
|
89
|
+
_url = url.as_ssh()
|
|
88
90
|
try:
|
|
89
|
-
|
|
90
|
-
except GitCommandError:
|
|
91
|
-
|
|
92
|
-
raise InvalidRepoCredentialsError(
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
91
|
+
default_branch = _get_repo_default_branch(_url, make_git_env(identity_file=identity_file))
|
|
92
|
+
except GitCommandError as e:
|
|
93
|
+
message = f"Cannot access `{_url}` using the `{identity_file}` private SSH key"
|
|
94
|
+
raise InvalidRepoCredentialsError(message) from e
|
|
95
|
+
creds = RemoteRepoCreds(
|
|
96
|
+
clone_url=_url,
|
|
97
|
+
private_key=private_key,
|
|
98
|
+
oauth_token=None,
|
|
99
|
+
)
|
|
100
|
+
return creds, default_branch
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _get_repo_creds_and_default_branch_https(
|
|
104
|
+
url: GitRepoURL, oauth_token: Optional[str] = None
|
|
105
|
+
) -> tuple[RemoteRepoCreds, Optional[str]]:
|
|
106
|
+
_url = url.as_https()
|
|
107
|
+
try:
|
|
108
|
+
default_branch = _get_repo_default_branch(url.as_https(oauth_token), make_git_env())
|
|
109
|
+
except GitCommandError as e:
|
|
110
|
+
message = f"Cannot access `{_url}`"
|
|
111
|
+
if oauth_token is not None:
|
|
112
|
+
masked_token = len(oauth_token[:-4]) * "*" + oauth_token[-4:]
|
|
113
|
+
message = f"{message} using the `{masked_token}` token"
|
|
114
|
+
raise InvalidRepoCredentialsError(message) from e
|
|
115
|
+
creds = RemoteRepoCreds(
|
|
116
|
+
clone_url=_url,
|
|
98
117
|
private_key=None,
|
|
118
|
+
oauth_token=oauth_token,
|
|
99
119
|
)
|
|
120
|
+
return creds, default_branch
|
|
100
121
|
|
|
101
122
|
|
|
102
|
-
def
|
|
123
|
+
def _get_repo_default_branch(url: str, env: dict[str, str]) -> Optional[str]:
|
|
124
|
+
# output example: "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
|
|
125
|
+
output: str = git.cmd.Git().ls_remote("--symref", url, "HEAD", env=env)
|
|
126
|
+
for line in output.splitlines():
|
|
127
|
+
# line format: `<oid> TAB <ref> LF`
|
|
128
|
+
oid, _, ref = line.partition("\t")
|
|
129
|
+
if oid.startswith("ref:") and ref == "HEAD":
|
|
130
|
+
return oid.rsplit("/", maxsplit=1)[-1]
|
|
131
|
+
return None
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _read_private_key(identity_file: PathLike) -> str:
|
|
135
|
+
identity_file = Path(identity_file).expanduser().resolve()
|
|
103
136
|
if not Path(identity_file).exists():
|
|
104
|
-
raise InvalidRepoCredentialsError(f"The {identity_file} private SSH key doesn't exist")
|
|
137
|
+
raise InvalidRepoCredentialsError(f"The `{identity_file}` private SSH key doesn't exist")
|
|
105
138
|
if not os.access(identity_file, os.R_OK):
|
|
106
|
-
raise InvalidRepoCredentialsError(f"
|
|
139
|
+
raise InvalidRepoCredentialsError(f"Cannot access the `{identity_file}` private SSH key")
|
|
107
140
|
if not try_ssh_key_passphrase(identity_file):
|
|
108
141
|
raise InvalidRepoCredentialsError(
|
|
109
142
|
f"Cannot use the `{identity_file}` private SSH key. "
|
|
110
143
|
"Ensure that it is valid and passphrase-free"
|
|
111
144
|
)
|
|
112
|
-
with open(identity_file, "r") as
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
try:
|
|
116
|
-
git.cmd.Git().ls_remote(
|
|
117
|
-
url.as_ssh(), env=dict(GIT_SSH_COMMAND=make_ssh_command_for_git(identity_file))
|
|
118
|
-
)
|
|
119
|
-
except GitCommandError:
|
|
120
|
-
raise InvalidRepoCredentialsError(
|
|
121
|
-
f"Can't access `{url.as_ssh()}` using the `{identity_file}` private SSH key"
|
|
122
|
-
)
|
|
123
|
-
|
|
124
|
-
return RemoteRepoCreds(
|
|
125
|
-
clone_url=url.as_ssh(),
|
|
126
|
-
private_key=private_key,
|
|
127
|
-
oauth_token=None,
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
def get_default_branch(remote_url: str) -> Optional[str]:
|
|
132
|
-
"""
|
|
133
|
-
Get the default branch of a remote Git repository.
|
|
134
|
-
"""
|
|
135
|
-
try:
|
|
136
|
-
output = git.cmd.Git().ls_remote("--symref", remote_url, "HEAD", env=no_prompt_env)
|
|
137
|
-
for line in output.splitlines():
|
|
138
|
-
if line.startswith("ref:"):
|
|
139
|
-
return line.split()[1].split("/")[-1]
|
|
140
|
-
except Exception as e:
|
|
141
|
-
logger.debug("Failed to get remote repo default branch: %s", repr(e))
|
|
142
|
-
return None
|
|
145
|
+
with open(identity_file, "r") as file:
|
|
146
|
+
return file.read()
|
|
143
147
|
|
|
144
148
|
|
|
149
|
+
# Used for `config.yml` only, remove it with `repos` in `config.yml`
|
|
145
150
|
def load_repo(config: RepoConfig) -> Union[RemoteRepo, LocalRepo]:
|
|
146
151
|
if config.repo_type == "remote":
|
|
147
152
|
return RemoteRepo(repo_id=config.repo_id, local_repo_dir=config.path)
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
from datetime import timedelta
|
|
2
2
|
from typing import List
|
|
3
|
+
from uuid import UUID
|
|
3
4
|
|
|
4
5
|
from sqlalchemy import select, update
|
|
5
6
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
6
7
|
from sqlalchemy.orm import joinedload, load_only
|
|
7
8
|
|
|
8
|
-
from dstack._internal.core.models.fleets import FleetStatus
|
|
9
|
+
from dstack._internal.core.models.fleets import FleetSpec, FleetStatus
|
|
10
|
+
from dstack._internal.core.models.instances import InstanceStatus
|
|
9
11
|
from dstack._internal.server.db import get_db, get_session_ctx
|
|
10
12
|
from dstack._internal.server.models import (
|
|
11
13
|
FleetModel,
|
|
@@ -15,7 +17,9 @@ from dstack._internal.server.models import (
|
|
|
15
17
|
RunModel,
|
|
16
18
|
)
|
|
17
19
|
from dstack._internal.server.services.fleets import (
|
|
20
|
+
create_fleet_instance_model,
|
|
18
21
|
get_fleet_spec,
|
|
22
|
+
get_next_instance_num,
|
|
19
23
|
is_fleet_empty,
|
|
20
24
|
is_fleet_in_use,
|
|
21
25
|
)
|
|
@@ -65,31 +69,111 @@ async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel])
|
|
|
65
69
|
res = await session.execute(
|
|
66
70
|
select(FleetModel)
|
|
67
71
|
.where(FleetModel.id.in_(fleet_ids))
|
|
68
|
-
.options(joinedload(FleetModel.instances).load_only(InstanceModel.deleted))
|
|
69
72
|
.options(
|
|
70
|
-
joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id)
|
|
73
|
+
joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id),
|
|
74
|
+
joinedload(FleetModel.project),
|
|
71
75
|
)
|
|
72
76
|
.options(joinedload(FleetModel.runs).load_only(RunModel.status))
|
|
73
77
|
.execution_options(populate_existing=True)
|
|
74
78
|
)
|
|
75
79
|
fleet_models = list(res.unique().scalars().all())
|
|
76
80
|
|
|
81
|
+
# TODO: Drop fleets auto-deletion after dropping fleets auto-creation.
|
|
77
82
|
deleted_fleets_ids = []
|
|
78
|
-
now = get_current_datetime()
|
|
79
83
|
for fleet_model in fleet_models:
|
|
84
|
+
_consolidate_fleet_state_with_spec(session, fleet_model)
|
|
80
85
|
deleted = _autodelete_fleet(fleet_model)
|
|
81
86
|
if deleted:
|
|
82
87
|
deleted_fleets_ids.append(fleet_model.id)
|
|
83
|
-
fleet_model.last_processed_at =
|
|
88
|
+
fleet_model.last_processed_at = get_current_datetime()
|
|
89
|
+
await _update_deleted_fleets_placement_groups(session, deleted_fleets_ids)
|
|
90
|
+
await session.commit()
|
|
84
91
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
92
|
+
|
|
93
|
+
def _consolidate_fleet_state_with_spec(session: AsyncSession, fleet_model: FleetModel):
|
|
94
|
+
if fleet_model.status == FleetStatus.TERMINATING:
|
|
95
|
+
return
|
|
96
|
+
fleet_spec = get_fleet_spec(fleet_model)
|
|
97
|
+
if fleet_spec.configuration.nodes is None or fleet_spec.autocreated:
|
|
98
|
+
# Only explicitly created cloud fleets are consolidated.
|
|
99
|
+
return
|
|
100
|
+
if not _is_fleet_ready_for_consolidation(fleet_model):
|
|
101
|
+
return
|
|
102
|
+
added_instances = _maintain_fleet_nodes_min(session, fleet_model, fleet_spec)
|
|
103
|
+
if added_instances:
|
|
104
|
+
fleet_model.consolidation_attempt += 1
|
|
105
|
+
else:
|
|
106
|
+
# The fleet is already consolidated or consolidation is in progress.
|
|
107
|
+
# We reset consolidation_attempt in both cases for simplicity.
|
|
108
|
+
# The second case does not need reset but is ok to do since
|
|
109
|
+
# it means consolidation is longer than delay, so it won't happen too often.
|
|
110
|
+
# TODO: Reset consolidation_attempt on fleet in-place update.
|
|
111
|
+
fleet_model.consolidation_attempt = 0
|
|
112
|
+
fleet_model.last_consolidated_at = get_current_datetime()
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _is_fleet_ready_for_consolidation(fleet_model: FleetModel) -> bool:
|
|
116
|
+
consolidation_retry_delay = _get_consolidation_retry_delay(fleet_model.consolidation_attempt)
|
|
117
|
+
last_consolidated_at = fleet_model.last_consolidated_at or fleet_model.last_processed_at
|
|
118
|
+
duration_since_last_consolidation = get_current_datetime() - last_consolidated_at
|
|
119
|
+
return duration_since_last_consolidation >= consolidation_retry_delay
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# We use exponentially increasing consolidation retry delays so that
|
|
123
|
+
# consolidation does not happen too often. In particular, this prevents
|
|
124
|
+
# retrying instance provisioning constantly in case of no offers.
|
|
125
|
+
# TODO: Adjust delays.
|
|
126
|
+
_CONSOLIDATION_RETRY_DELAYS = [
|
|
127
|
+
timedelta(seconds=30),
|
|
128
|
+
timedelta(minutes=1),
|
|
129
|
+
timedelta(minutes=2),
|
|
130
|
+
timedelta(minutes=5),
|
|
131
|
+
timedelta(minutes=10),
|
|
132
|
+
]
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta:
|
|
136
|
+
if consolidation_attempt < len(_CONSOLIDATION_RETRY_DELAYS):
|
|
137
|
+
return _CONSOLIDATION_RETRY_DELAYS[consolidation_attempt]
|
|
138
|
+
return _CONSOLIDATION_RETRY_DELAYS[-1]
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _maintain_fleet_nodes_min(
|
|
142
|
+
session: AsyncSession,
|
|
143
|
+
fleet_model: FleetModel,
|
|
144
|
+
fleet_spec: FleetSpec,
|
|
145
|
+
) -> bool:
|
|
146
|
+
"""
|
|
147
|
+
Ensures the fleet has at least `nodes.min` instances.
|
|
148
|
+
Returns `True` if retried or added new instances and `False` otherwise.
|
|
149
|
+
"""
|
|
150
|
+
assert fleet_spec.configuration.nodes is not None
|
|
151
|
+
for instance in fleet_model.instances:
|
|
152
|
+
# Delete terminated but not deleted instances since
|
|
153
|
+
# they are going to be replaced with new pending instances.
|
|
154
|
+
if instance.status == InstanceStatus.TERMINATED and not instance.deleted:
|
|
155
|
+
# It's safe to modify instances without instance lock since
|
|
156
|
+
# no other task modifies already terminated instances.
|
|
157
|
+
instance.deleted = True
|
|
158
|
+
instance.deleted_at = get_current_datetime()
|
|
159
|
+
active_instances = [i for i in fleet_model.instances if not i.deleted]
|
|
160
|
+
active_instances_num = len(active_instances)
|
|
161
|
+
if active_instances_num >= fleet_spec.configuration.nodes.min:
|
|
162
|
+
return False
|
|
163
|
+
nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num
|
|
164
|
+
for i in range(nodes_missing):
|
|
165
|
+
instance_model = create_fleet_instance_model(
|
|
166
|
+
session=session,
|
|
167
|
+
project=fleet_model.project,
|
|
168
|
+
# TODO: Store fleet.user and pass it instead of the project owner.
|
|
169
|
+
username=fleet_model.project.owner.name,
|
|
170
|
+
spec=fleet_spec,
|
|
171
|
+
instance_num=get_next_instance_num({i.instance_num for i in active_instances}),
|
|
89
172
|
)
|
|
90
|
-
.
|
|
91
|
-
|
|
92
|
-
|
|
173
|
+
active_instances.append(instance_model)
|
|
174
|
+
fleet_model.instances.append(instance_model)
|
|
175
|
+
logger.info("Added %s instances to fleet %s", nodes_missing, fleet_model.name)
|
|
176
|
+
return True
|
|
93
177
|
|
|
94
178
|
|
|
95
179
|
def _autodelete_fleet(fleet_model: FleetModel) -> bool:
|
|
@@ -100,7 +184,7 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
|
|
|
100
184
|
if (
|
|
101
185
|
fleet_model.status != FleetStatus.TERMINATING
|
|
102
186
|
and fleet_spec.configuration.nodes is not None
|
|
103
|
-
and
|
|
187
|
+
and fleet_spec.configuration.nodes.min == 0
|
|
104
188
|
):
|
|
105
189
|
# Empty fleets that allow 0 nodes should not be auto-deleted
|
|
106
190
|
return False
|
|
@@ -110,3 +194,15 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
|
|
|
110
194
|
fleet_model.deleted = True
|
|
111
195
|
logger.info("Fleet %s deleted", fleet_model.name)
|
|
112
196
|
return True
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
async def _update_deleted_fleets_placement_groups(session: AsyncSession, fleets_ids: list[UUID]):
|
|
200
|
+
if len(fleets_ids) == 0:
|
|
201
|
+
return
|
|
202
|
+
await session.execute(
|
|
203
|
+
update(PlacementGroupModel)
|
|
204
|
+
.where(
|
|
205
|
+
PlacementGroupModel.fleet_id.in_(fleets_ids),
|
|
206
|
+
)
|
|
207
|
+
.values(fleet_deleted=True)
|
|
208
|
+
)
|