dstack 0.19.28__py3-none-any.whl → 0.19.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack might be problematic.
- dstack/_internal/cli/main.py +3 -1
- dstack/_internal/cli/services/configurators/fleet.py +20 -6
- dstack/_internal/cli/utils/gpu.py +2 -2
- dstack/_internal/core/backends/aws/compute.py +62 -41
- dstack/_internal/core/backends/aws/resources.py +11 -6
- dstack/_internal/core/backends/azure/compute.py +25 -13
- dstack/_internal/core/backends/base/compute.py +121 -14
- dstack/_internal/core/backends/base/offers.py +34 -4
- dstack/_internal/core/backends/cloudrift/compute.py +5 -7
- dstack/_internal/core/backends/cudo/compute.py +4 -2
- dstack/_internal/core/backends/datacrunch/compute.py +13 -11
- dstack/_internal/core/backends/digitalocean_base/compute.py +4 -5
- dstack/_internal/core/backends/gcp/compute.py +25 -11
- dstack/_internal/core/backends/hotaisle/compute.py +4 -7
- dstack/_internal/core/backends/kubernetes/compute.py +6 -4
- dstack/_internal/core/backends/lambdalabs/compute.py +4 -5
- dstack/_internal/core/backends/local/compute.py +1 -3
- dstack/_internal/core/backends/nebius/compute.py +10 -7
- dstack/_internal/core/backends/oci/compute.py +15 -8
- dstack/_internal/core/backends/oci/resources.py +8 -3
- dstack/_internal/core/backends/runpod/compute.py +15 -6
- dstack/_internal/core/backends/template/compute.py.jinja +3 -1
- dstack/_internal/core/backends/tensordock/compute.py +1 -3
- dstack/_internal/core/backends/tensordock/models.py +2 -0
- dstack/_internal/core/backends/vastai/compute.py +7 -3
- dstack/_internal/core/backends/vultr/compute.py +5 -5
- dstack/_internal/core/consts.py +2 -0
- dstack/_internal/core/models/projects.py +8 -0
- dstack/_internal/core/services/repos.py +101 -10
- dstack/_internal/server/background/tasks/process_instances.py +3 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +100 -47
- dstack/_internal/server/services/backends/__init__.py +1 -1
- dstack/_internal/server/services/projects.py +11 -3
- dstack/_internal/server/services/runs.py +2 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +3 -0
- dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js → main-c51afa7f243e24d3e446.js} +61081 -49037
- dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js.map → main-c51afa7f243e24d3e446.js.map} +1 -1
- dstack/_internal/utils/ssh.py +22 -2
- dstack/version.py +2 -2
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/METADATA +8 -6
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/RECORD +46 -50
- dstack/_internal/core/backends/tensordock/__init__.py +0 -0
- dstack/_internal/core/backends/tensordock/api_client.py +0 -104
- dstack/_internal/core/backends/tensordock/backend.py +0 -16
- dstack/_internal/core/backends/tensordock/configurator.py +0 -74
- dstack/_internal/server/statics/main-5e0d56245c4bd241ec27.css +0 -3
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/WHEEL +0 -0
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/tensordock/compute.py CHANGED
@@ -39,9 +39,7 @@ class TensorDockCompute(
         self.config = config
         self.api_client = TensorDockAPIClient(config.creds.api_key, config.creds.api_token)

-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.TENSORDOCK,
             requirements=requirements,
dstack/_internal/core/backends/tensordock/models.py CHANGED
@@ -4,6 +4,8 @@ from pydantic import Field

 from dstack._internal.core.models.common import CoreModel

+# TODO: TensorDock is deprecated and will be removed in the future
+

 class TensorDockAPIKeyCreds(CoreModel):
     type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
dstack/_internal/core/backends/vastai/compute.py CHANGED
@@ -5,6 +5,7 @@ from gpuhunt.providers.vastai import VastAIProvider

 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithFilteredOffersCached,
     generate_unique_instance_name_for_job,
     get_docker_commands,
 )
@@ -30,7 +31,10 @@ logger = get_logger(__name__)
 MAX_INSTANCE_NAME_LEN = 60


-class VastAICompute(Compute):
+class VastAICompute(
+    ComputeWithFilteredOffersCached,
+    Compute,
+):
     def __init__(self, config: VastAIConfig):
         super().__init__()
         self.config = config
@@ -49,8 +53,8 @@ class VastAICompute(Compute):
             )
         )

-    def get_offers(
-        self, requirements: Optional[Requirements] = None
+    def get_offers_by_requirements(
+        self, requirements: Requirements
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VASTAI,
dstack/_internal/core/backends/vultr/compute.py CHANGED
@@ -6,6 +6,7 @@ import requests

 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     generate_unique_instance_name,
@@ -23,7 +24,7 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
 )
 from dstack._internal.core.models.placement import PlacementGroup
-from dstack._internal.core.models.runs import JobProvisioningData, Requirements
+from dstack._internal.core.models.runs import JobProvisioningData
 from dstack._internal.utils.logging import get_logger

 logger = get_logger(__name__)
@@ -32,6 +33,7 @@ MAX_INSTANCE_NAME_LEN = 64


 class VultrCompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     Compute,
@@ -41,12 +43,10 @@ class VultrCompute(
         self.config = config
         self.api_client = VultrApiClient(config.creds.api_key)

-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VULTR,
-            requirements=requirements,
+            requirements=None,
             locations=self.config.regions or None,
             extra_filter=_supported_instances,
         )
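The Vultr and VastAI changes above are two sides of the offer-caching refactor in dstack/_internal/core/backends/base/compute.py (+121 -14): instead of every backend overriding get_offers, backends now mix in ComputeWithAllOffersCached (implement get_all_offers_with_availability; the base class caches and filters) or ComputeWithFilteredOffersCached (implement get_offers_by_requirements; results are cached per requirements). A minimal sketch of the first pattern; the mixin and method names come from the diff, while the Offer/Requirements stand-ins, the cache policy, and the filtering rule are assumptions for illustration:

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Offer:  # hypothetical stand-in for InstanceOfferWithAvailability
    instance_type: str
    gpu_count: int
    price: float


@dataclass
class Requirements:  # hypothetical stand-in for dstack's Requirements model
    min_gpu_count: int = 0


class ComputeWithAllOffersCached(ABC):
    """Subclasses return every offer once; get_offers() filters the cached list."""

    def __init__(self) -> None:
        self._cache: Optional[List[Offer]] = None

    @abstractmethod
    def get_all_offers_with_availability(self) -> List[Offer]: ...

    def get_offers(self, requirements: Requirements) -> List[Offer]:
        if self._cache is None:  # assumed: the real cache would also expire entries
            self._cache = self.get_all_offers_with_availability()
        return [o for o in self._cache if o.gpu_count >= requirements.min_gpu_count]


class FakeCompute(ComputeWithAllOffersCached):
    def get_all_offers_with_availability(self) -> List[Offer]:
        return [Offer("cpu-small", 0, 0.01), Offer("gpu-large", 8, 12.50)]


print(FakeCompute().get_offers(Requirements(min_gpu_count=1)))  # only gpu-large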
dstack/_internal/core/consts.py CHANGED
@@ -4,3 +4,5 @@ DSTACK_SHIM_HTTP_PORT = 10998
 DSTACK_RUNNER_HTTP_PORT = 10999
 # ssh server (runs alongside the runner inside a container) listen port
 DSTACK_RUNNER_SSH_PORT = 10022
+# legacy AWS, Azure, GCP, and OCI image for older GPUs
+DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES = "0.10"
dstack/_internal/core/models/projects.py CHANGED
@@ -26,3 +26,11 @@ class Project(CoreModel):
     backends: List[BackendInfo]
     members: List[Member]
     is_public: bool = False
+
+
+class ProjectHookConfig(CoreModel):
+    """
+    This class can be inherited to extend the project creation configuration passed to the hooks.
+    """
+
+    pass
dstack/_internal/core/services/repos.py CHANGED
@@ -36,24 +36,59 @@ def get_repo_creds_and_default_branch(

     # no auth
     with suppress(InvalidRepoCredentialsError):
-        return _get_repo_creds_and_default_branch_https(url)
+        creds, default_branch = _get_repo_creds_and_default_branch_https(url)
+        logger.debug(
+            "Git repo %s is public. Using no auth. Default branch: %s", repo_url, default_branch
+        )
+        return creds, default_branch

     # ssh key provided by the user or pulled from the server
     if identity_file is not None or private_key is not None:
         if identity_file is not None:
             private_key = _read_private_key(identity_file)
-            return _get_repo_creds_and_default_branch_ssh(url, identity_file, private_key)
+            creds, default_branch = _get_repo_creds_and_default_branch_ssh(
+                url, identity_file, private_key
+            )
+            logger.debug(
+                "Git repo %s is private. Using identity file: %s. Default branch: %s",
+                repo_url,
+                identity_file,
+                default_branch,
+            )
+            return creds, default_branch
         elif private_key is not None:
             with NamedTemporaryFile("w+", 0o600) as f:
                 f.write(private_key)
                 f.flush()
-                return _get_repo_creds_and_default_branch_ssh(url, f.name, private_key)
+                creds, default_branch = _get_repo_creds_and_default_branch_ssh(
+                    url, f.name, private_key
+                )
+                masked_key = "***" + private_key[-10:] if len(private_key) > 10 else "***MASKED***"
+                logger.debug(
+                    "Git repo %s is private. Using private key: %s. Default branch: %s",
+                    repo_url,
+                    masked_key,
+                    default_branch,
+                )
+                return creds, default_branch
         else:
             assert False, "should not reach here"

     # oauth token provided by the user or pulled from the server
     if oauth_token is not None:
-        return _get_repo_creds_and_default_branch_https(url, oauth_token)
+        creds, default_branch = _get_repo_creds_and_default_branch_https(url, oauth_token)
+        masked_token = (
+            len(oauth_token[:-4]) * "*" + oauth_token[-4:]
+            if len(oauth_token) > 4
+            else "***MASKED***"
+        )
+        logger.debug(
+            "Git repo %s is private. Using provided OAuth token: %s. Default branch: %s",
+            repo_url,
+            masked_token,
+            default_branch,
+        )
+        return creds, default_branch

     # key from ssh config
     identities = get_host_config(url.original_host).get("identityfile")
@@ -61,7 +96,16 @@ def get_repo_creds_and_default_branch(
         _identity_file = identities[0]
         with suppress(InvalidRepoCredentialsError):
             _private_key = _read_private_key(_identity_file)
-            return _get_repo_creds_and_default_branch_ssh(url, _identity_file, _private_key)
+            creds, default_branch = _get_repo_creds_and_default_branch_ssh(
+                url, _identity_file, _private_key
+            )
+            logger.debug(
+                "Git repo %s is private. Using SSH config identity file: %s. Default branch: %s",
+                repo_url,
+                _identity_file,
+                default_branch,
+            )
+            return creds, default_branch

     # token from gh config
     if os.path.exists(gh_config_path):
@@ -70,13 +114,35 @@ def get_repo_creds_and_default_branch(
         _oauth_token = gh_hosts.get(url.host, {}).get("oauth_token")
         if _oauth_token is not None:
             with suppress(InvalidRepoCredentialsError):
-                return _get_repo_creds_and_default_branch_https(url, _oauth_token)
+                creds, default_branch = _get_repo_creds_and_default_branch_https(url, _oauth_token)
+                masked_token = (
+                    len(_oauth_token[:-4]) * "*" + _oauth_token[-4:]
+                    if len(_oauth_token) > 4
+                    else "***MASKED***"
+                )
+                logger.debug(
+                    "Git repo %s is private. Using GitHub config token: %s from %s. Default branch: %s",
+                    repo_url,
+                    masked_token,
+                    gh_config_path,
+                    default_branch,
+                )
+                return creds, default_branch

     # default user key
     if os.path.exists(default_ssh_key):
         with suppress(InvalidRepoCredentialsError):
             _private_key = _read_private_key(default_ssh_key)
-            return _get_repo_creds_and_default_branch_ssh(url, default_ssh_key, _private_key)
+            creds, default_branch = _get_repo_creds_and_default_branch_ssh(
+                url, default_ssh_key, _private_key
+            )
+            logger.debug(
+                "Git repo %s is private. Using default identity file: %s. Default branch: %s",
+                repo_url,
+                default_ssh_key,
+                default_branch,
+            )
+            return creds, default_branch

     raise InvalidRepoCredentialsError(
         "No valid default Git credentials found. Pass valid `--token` or `--git-identity`."
@@ -87,8 +153,9 @@ def _get_repo_creds_and_default_branch_ssh(
     url: GitRepoURL, identity_file: PathLike, private_key: str
 ) -> tuple[RemoteRepoCreds, Optional[str]]:
     _url = url.as_ssh()
+    env = _make_git_env_for_creds_check(identity_file=identity_file)
     try:
-        default_branch = _get_repo_default_branch(_url, make_git_env(identity_file=identity_file))
+        default_branch = _get_repo_default_branch(_url, env)
     except GitCommandError as e:
         message = f"Cannot access `{_url}` using the `{identity_file}` private SSH key"
         raise InvalidRepoCredentialsError(message) from e
@@ -104,8 +171,9 @@ def _get_repo_creds_and_default_branch_https(
     url: GitRepoURL, oauth_token: Optional[str] = None
 ) -> tuple[RemoteRepoCreds, Optional[str]]:
     _url = url.as_https()
+    env = _make_git_env_for_creds_check()
     try:
-        default_branch = _get_repo_default_branch(url.as_https(oauth_token), make_git_env())
+        default_branch = _get_repo_default_branch(url.as_https(oauth_token), env)
     except GitCommandError as e:
         message = f"Cannot access `{_url}`"
         if oauth_token is not None:
@@ -120,9 +188,32 @@ def _get_repo_creds_and_default_branch_https(
     return creds, default_branch


+def _make_git_env_for_creds_check(identity_file: Optional[PathLike] = None) -> dict[str, str]:
+    # Our goal is to check if _provided_ creds (if any) are correct, so we need to be sure that
+    # only the provided creds are used, without falling back to any additional mechanisms.
+    # To do this, we:
+    # 1. Disable all configs to ignore any stored creds
+    # 2. Disable askpass to avoid asking for creds interactively or fetching stored creds from
+    # a non-interactive askpass helper (for example, VS Code sets GIT_ASKPASS to its own helper,
+    # which silently provides creds to Git).
+    return make_git_env(disable_config=True, disable_askpass=True, identity_file=identity_file)
+
+
 def _get_repo_default_branch(url: str, env: dict[str, str]) -> Optional[str]:
+    # Git shipped by Apple with XCode is patched to support an additional config scope
+    # above "system" called "xcode". There is no option in `git config list` to show this config,
+    # but you can list the merged config (`git config list` without options) and then exclude
+    # all settings listed in `git config list --{system,global,local,worktree}`.
+    # As of time of writing, there are only two settings in the "xcode" config, one of which breaks
+    # our "is repo public?" check, namely "credential.helper=osxkeychain".
+    # As there is no way to disable "xcode" config (no env variable, no CLI option, etc.),
+    # the only way to disable credential helper is to override this specific setting with an empty
+    # string via command line argument: `git -c credential.helper= COMMAND [ARGS ...]`.
+    # See: https://github.com/git/git/commit/3d4355712b9fe77a96ad4ad877d92dc7ff6e0874
+    # See: https://gist.github.com/ChrisTollefson/ab9c0a5d1dd4dd615217345c6936a307
+    _git = git.cmd.Git()(c="credential.helper=")
     # output example: "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
-    output: str = git.cmd.Git().ls_remote("--symref", url, "HEAD", env=env)
+    output: str = _git.ls_remote("--symref", url, "HEAD", env=env)
     for line in output.splitlines():
         # line format: `<oid> TAB <ref> LF`
         oid, _, ref = line.partition("\t")
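Two pieces of the repos.py change can be exercised standalone: the token-masking expression used by the new debug logs, and the `<oid> TAB <ref>` parsing of `git ls-remote --symref` output. The masking expression and the partition("\t") loop are taken from the diff; the final `ref: refs/heads/` extraction is not shown in the hunk, so that part is one plausible way to finish the parse:

from typing import Optional


def mask_token(token: str) -> str:
    # Same expression as in the diff: star everything except the last 4 characters.
    return len(token[:-4]) * "*" + token[-4:] if len(token) > 4 else "***MASKED***"


def parse_default_branch(ls_remote_output: str) -> Optional[str]:
    # Output example from the diff's comment:
    # "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
    for line in ls_remote_output.splitlines():
        oid, _, ref = line.partition("\t")
        if ref == "HEAD" and oid.startswith("ref: refs/heads/"):
            return oid[len("ref: refs/heads/"):]
    return None


output = "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
print(mask_token("ghp_abcdefgh123456"))  # **************3456
print(parse_default_branch(output))      # dev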
dstack/_internal/server/background/tasks/process_instances.py CHANGED
@@ -578,7 +578,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         if placement_group_model is None:  # error occurred
             continue
         session.add(placement_group_model)
-        await session.flush()
         placement_group_models.append(placement_group_model)
         logger.debug(
             "Trying %s in %s/%s for $%0.4f per hour",
@@ -636,7 +635,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
             },
         )
         if instance.fleet_id and _is_fleet_master_instance(instance):
-            # Clean up placement groups that did not end up being used
+            # Clean up placement groups that did not end up being used.
+            # Flush to update still uncommitted placement groups.
+            await session.flush()
             await schedule_fleet_placement_groups_deletion(
                 session=session,
                 fleet_id=instance.fleet_id,
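Moving the flush relies on standard SQLAlchemy semantics: session.flush() sends pending INSERTs to the database inside the still-open transaction, so later statements in the same session can see the rows before commit. A minimal synchronous sketch; the PlacementGroup model is a hypothetical stand-in, and autoflush is disabled so the explicit flush is what makes the row visible:

from sqlalchemy import Integer, String, create_engine, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class PlacementGroup(Base):  # hypothetical stand-in for dstack's model
    __tablename__ = "placement_groups"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    name: Mapped[str] = mapped_column(String)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine, autoflush=False) as session:
    session.add(PlacementGroup(name="pg-1"))
    session.flush()  # INSERT is emitted now; the transaction is still open
    names = session.scalars(select(PlacementGroup.name)).all()
    print(names)  # ['pg-1'] -- visible to queries before commit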
dstack/_internal/server/background/tasks/process_running_jobs.py CHANGED
@@ -1139,7 +1139,7 @@ def _patch_base_image_for_aws_efa(
     efa_enabled_patterns = [
         # TODO: p6-b200 isn't supported yet in gpuhunt
         r"^p6-b200\.(48xlarge)$",
-        r"^p5\.(48xlarge)$",
+        r"^p5\.(4xlarge|48xlarge)$",
         r"^p5e\.(48xlarge)$",
         r"^p5en\.(48xlarge)$",
         r"^p4d\.(24xlarge)$",
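The one-line change above extends EFA-enabled provisioning to p5.4xlarge. The patterns are easy to exercise directly; the regexes are copied from the diff, while the is_efa_enabled helper is hypothetical:

import re

efa_enabled_patterns = [
    r"^p6-b200\.(48xlarge)$",
    r"^p5\.(4xlarge|48xlarge)$",
    r"^p5e\.(48xlarge)$",
    r"^p5en\.(48xlarge)$",
    r"^p4d\.(24xlarge)$",
]


def is_efa_enabled(instance_type: str) -> bool:
    return any(re.match(p, instance_type) for p in efa_enabled_patterns)


print(is_efa_enabled("p5.4xlarge"))  # True -- newly matched by this release
print(is_efa_enabled("p5.2xlarge"))  # False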
dstack/_internal/server/background/tasks/process_submitted_jobs.py CHANGED
@@ -3,7 +3,7 @@ import itertools
 import math
 import uuid
 from datetime import datetime, timedelta
-from typing import List, Optional
+from typing import List, Optional

 from sqlalchemy import and_, func, not_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -25,6 +25,7 @@ from dstack._internal.core.models.instances import InstanceOfferWithAvailability
 from dstack._internal.core.models.profiles import (
     DEFAULT_RUN_TERMINATION_IDLE_TIME,
     CreationPolicy,
+    Profile,
     TerminationPolicy,
 )
 from dstack._internal.core.models.resources import Memory
@@ -34,6 +35,7 @@ from dstack._internal.core.models.runs import (
     JobRuntimeData,
     JobStatus,
     JobTerminationReason,
+    Requirements,
     Run,
     RunSpec,
 )
@@ -186,7 +188,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     project = run_model.project
     run = run_model_to_run(run_model)
     run_spec = run.run_spec
-    profile = run_spec.merged_profile
+    run_profile = run_spec.merged_profile
     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
     multinode = job.job_spec.jobs_per_replica > 1

@@ -289,7 +291,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         instance_filters=instance_filters,
     )
     fleet_models = fleet_models_with_instances + fleet_models_without_instances
-    fleet_model, fleet_instances_with_offers = _find_optimal_fleet_with_offers(
+    fleet_model, fleet_instances_with_offers = await _find_optimal_fleet_with_offers(
+        project=project,
         fleet_models=fleet_models,
         run_model=run_model,
         run_spec=run.run_spec,
@@ -332,7 +335,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         job_model.status = JobStatus.PROVISIONING
     else:
         # Assigned no instance, create a new one
-        if profile.creation_policy == CreationPolicy.REUSE:
+        if run_profile.creation_policy == CreationPolicy.REUSE:
             logger.debug("%s: reuse instance failed", fmt(job_model))
             job_model.status = JobStatus.TERMINATING
             job_model.termination_reason = JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
@@ -361,7 +364,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         return

     logger.info("%s: now is provisioning a new instance", fmt(job_model))
-    job_provisioning_data, offer = run_job_result
+    job_provisioning_data, offer, effective_profile, _ = run_job_result
     job_model.job_provisioning_data = job_provisioning_data.json()
     job_model.status = JobStatus.PROVISIONING
     if fleet_model is None:
@@ -381,12 +384,11 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         instance = _create_instance_model_for_job(
             project=project,
             fleet_model=fleet_model,
-            run_spec=run_spec,
             job_model=job_model,
-            job=job,
             job_provisioning_data=job_provisioning_data,
             offer=offer,
             instance_num=instance_num,
+            profile=effective_profile,
         )
         job_model.job_runtime_data = _prepare_job_runtime_data(offer, multinode).json()
         # Both this task and process_fleets can add instances to fleets.
@@ -492,7 +494,8 @@ async def _refetch_fleet_models_with_instances(
     return fleet_models


-def _find_optimal_fleet_with_offers(
+async def _find_optimal_fleet_with_offers(
+    project: ProjectModel,
     fleet_models: list[FleetModel],
     run_model: RunModel,
     run_spec: RunSpec,
@@ -502,58 +505,98 @@ def _find_optimal_fleet_with_offers(
 ) -> tuple[Optional[FleetModel], list[tuple[InstanceModel, InstanceOfferWithAvailability]]]:
     if run_model.fleet is not None:
         # Using the fleet that was already chosen by the master job
-        fleet_instances_with_offers = _get_fleet_instances_with_offers(
+        fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
             fleet_model=run_model.fleet,
             run_spec=run_spec,
             job=job,
             master_job_provisioning_data=master_job_provisioning_data,
             volumes=volumes,
         )
-        return run_model.fleet, fleet_instances_with_offers
+        return run_model.fleet, fleet_instances_with_pool_offers

     if len(fleet_models) == 0:
         return None, []

     nodes_required_num = _get_nodes_required_num_for_run(run_spec)
-    # The current strategy is to first consider fleets that can accommodate
-    # the run without additional provisioning and choose the one with the cheapest offer.
-    #
+    # The current strategy is first to consider fleets that can accommodate
+    # the run without additional provisioning and choose the one with the cheapest pool offer.
+    # Then choose a fleet with the cheapest pool offer among all fleets with pool offers.
+    # If there are no fleets with pool offers, choose a fleet with a cheapest backend offer.
+    # Fallback to autocreated fleet if fleets have no pool or backend offers.
+    # TODO: Consider trying all backend offers and then choosing a fleet.
     candidate_fleets_with_offers: list[
         tuple[
             Optional[FleetModel],
             list[tuple[InstanceModel, InstanceOfferWithAvailability]],
             int,
-
+            int,
+            tuple[int, float, float],
         ]
     ] = []
     for candidate_fleet_model in fleet_models:
-        fleet_instances_with_offers = _get_fleet_instances_with_offers(
+        fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
             fleet_model=candidate_fleet_model,
             run_spec=run_spec,
             job=job,
             master_job_provisioning_data=master_job_provisioning_data,
             volumes=volumes,
         )
-
-
-
-
-
-
-
-
+        fleet_has_available_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
+        fleet_cheapest_pool_offer = math.inf
+        if len(fleet_instances_with_pool_offers) > 0:
+            fleet_cheapest_pool_offer = fleet_instances_with_pool_offers[0][1].price
+
+        candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
+        profile = None
+        requirements = None
+        try:
+            profile, requirements = _get_run_profile_and_requirements_in_fleet(
+                job=job,
+                run_spec=run_spec,
+                fleet=candidate_fleet,
+            )
+        except ValueError:
+            pass
+        fleet_backend_offers = []
+        if profile is not None and requirements is not None:
+            multinode = (
+                candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
+                or job.job_spec.jobs_per_replica > 1
+            )
+            fleet_backend_offers = await get_offers_by_requirements(
+                project=project,
+                profile=profile,
+                requirements=requirements,
+                exclude_not_available=True,
+                multinode=multinode,
+                master_job_provisioning_data=master_job_provisioning_data,
+                volumes=volumes,
+                privileged=job.job_spec.privileged,
+                instance_mounts=check_run_spec_requires_instance_mounts(run_spec),
+            )
+
+        fleet_cheapest_backend_offer = math.inf
+        if len(fleet_backend_offers) > 0:
+            fleet_cheapest_backend_offer = fleet_backend_offers[0][1].price
+
+        fleet_priority = (
+            not fleet_has_available_capacity,
+            fleet_cheapest_pool_offer,
+            fleet_cheapest_backend_offer,
+        )
         candidate_fleets_with_offers.append(
             (
                 candidate_fleet_model,
-                fleet_instances_with_offers,
-                len(fleet_instances_with_offers),
+                fleet_instances_with_pool_offers,
+                len(fleet_instances_with_pool_offers),
+                len(fleet_backend_offers),
                 fleet_priority,
             )
         )
     if run_spec.merged_profile.fleets is None and all(
-        t[2] == 0 for t in candidate_fleets_with_offers
+        t[2] == 0 and t[3] == 0 for t in candidate_fleets_with_offers
     ):
-        # If fleets are not specified and no fleets have available offers, create a new fleet.
+        # If fleets are not specified and no fleets have available pool or backend offers, create a new fleet.
         # This is for compatibility with non-fleet-first UX when runs created new fleets
         # if there are no instances to reuse.
         return None, []
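The fleet_priority tuple makes fleet ranking a plain lexicographic comparison: fleets with enough pool capacity sort first (False orders before True), ties break on the cheapest pool offer, then on the cheapest backend offer, and math.inf pushes fleets with no offers to the back. The selection step itself is outside this hunk; the candidates below are invented to show the ordering:

import math

candidates = {
    "fleet-a": (False, 0.80, 1.20),         # has pool capacity, pool offer at $0.80
    "fleet-b": (True, math.inf, 0.50),      # no capacity, backend offer at $0.50
    "fleet-c": (True, math.inf, math.inf),  # no pool or backend offers
}

print(min(candidates, key=candidates.get))     # fleet-a
print(sorted(candidates, key=candidates.get))  # ['fleet-a', 'fleet-b', 'fleet-c']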
@@ -573,7 +616,7 @@ def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
     return nodes_required_num


-def _get_fleet_instances_with_offers(
+def _get_fleet_instances_with_pool_offers(
     fleet_model: FleetModel,
     run_spec: RunSpec,
     job: Job,
@@ -661,7 +704,7 @@ async def _run_job_on_new_instance(
     master_job_provisioning_data: Optional[JobProvisioningData] = None,
     volumes: Optional[List[List[Volume]]] = None,
     fleet_model: Optional[FleetModel] = None,
-) -> Optional[tuple[JobProvisioningData, InstanceOfferWithAvailability]]:
+) -> Optional[tuple[JobProvisioningData, InstanceOfferWithAvailability, Profile, Requirements]]:
     if volumes is None:
         volumes = []
     profile = run.run_spec.merged_profile
@@ -669,21 +712,14 @@ async def _run_job_on_new_instance(
     fleet = None
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)
-
-
-
-
-
-        profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, profile)
-        if profile is None:
-            logger.debug("%s: cannot combine fleet %s profile", fmt(job_model), fleet_model.name)
-            return None
-        fleet_requirements = get_fleet_requirements(fleet.spec)
-        requirements = combine_fleet_and_run_requirements(fleet_requirements, requirements)
-        if requirements is None:
-            logger.debug(
-                "%s: cannot combine fleet %s requirements", fmt(job_model), fleet_model.name
+        try:
+            profile, requirements = _get_run_profile_and_requirements_in_fleet(
+                job=job,
+                run_spec=run.run_spec,
+                fleet=fleet,
             )
+        except ValueError as e:
+            logger.debug("%s: %s", fmt(job_model), e.args[0])
             return None
     # TODO: Respect fleet provisioning properties such as tags

@@ -723,7 +759,7 @@ async def _run_job_on_new_instance(
             project_ssh_private_key,
             offer_volumes,
         )
-        return job_provisioning_data, offer
+        return job_provisioning_data, offer, profile, requirements
     except BackendError as e:
         logger.warning(
             "%s: %s launch in %s/%s failed: %s",
@@ -746,6 +782,25 @@ async def _run_job_on_new_instance(
     return None


+def _get_run_profile_and_requirements_in_fleet(
+    job: Job,
+    run_spec: RunSpec,
+    fleet: Fleet,
+) -> tuple[Profile, Requirements]:
+    if not _check_can_create_new_instance_in_fleet(fleet):
+        raise ValueError("Cannot fit new instance into fleet")
+    profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, run_spec.merged_profile)
+    if profile is None:
+        raise ValueError("Cannot combine fleet profile")
+    fleet_requirements = get_fleet_requirements(fleet.spec)
+    requirements = combine_fleet_and_run_requirements(
+        fleet_requirements, job.job_spec.requirements
+    )
+    if requirements is None:
+        raise ValueError("Cannot combine fleet requirements")
+    return profile, requirements
+
+
 def _check_can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
     if fleet.spec.configuration.ssh_config is not None:
         return False
@@ -814,14 +869,12 @@ async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel)
 def _create_instance_model_for_job(
     project: ProjectModel,
     fleet_model: FleetModel,
-    run_spec: RunSpec,
     job_model: JobModel,
-    job: Job,
     job_provisioning_data: JobProvisioningData,
     offer: InstanceOfferWithAvailability,
     instance_num: int,
+    profile: Profile,
 ) -> InstanceModel:
-    profile = run_spec.merged_profile
     if not job_provisioning_data.dockerized:
         # terminate vastai/k8s instances immediately
         termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
dstack/_internal/server/services/backends/__init__.py CHANGED
@@ -345,7 +345,7 @@ async def get_instance_offers(
     Returns list of instances satisfying minimal resource requirements sorted by price
     """
     logger.info("Requesting instance offers from backends: %s", [b.TYPE.value for b in backends])
-    tasks = [run_async(backend.compute().get_offers_cached, requirements) for backend in backends]
+    tasks = [run_async(backend.compute().get_offers, requirements) for backend in backends]
     offers_by_backend = []
     for backend, result in zip(backends, await asyncio.gather(*tasks, return_exceptions=True)):
         if isinstance(result, BackendError):
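get_instance_offers keeps its fan-out shape: one task per backend, asyncio.gather(..., return_exceptions=True), and BackendError results skipped instead of failing the whole query. A self-contained sketch of that pattern; dstack wraps the synchronous compute call with run_async and logs skipped backends, while the fake backends, prices, and helper here are stand-ins:

import asyncio


class BackendError(Exception):  # stand-in for dstack's BackendError
    pass


class FakeBackend:
    def __init__(self, name, offers, fail=False):
        self.name, self.offers, self.fail = name, offers, fail

    async def get_offers(self, requirements):
        if self.fail:
            raise BackendError(f"{self.name} unavailable")
        return self.offers


async def get_instance_offers(backends, requirements):
    tasks = [b.get_offers(requirements) for b in backends]
    offers = []
    for backend, result in zip(backends, await asyncio.gather(*tasks, return_exceptions=True)):
        if isinstance(result, BackendError):
            continue  # the real code logs the failure and moves on
        if isinstance(result, BaseException):
            raise result
        offers.extend((backend.name, price) for price in result)
    return sorted(offers, key=lambda t: t[1])  # cheapest first, as the docstring says


backends = [FakeBackend("aws", [1.2, 0.4]), FakeBackend("gcp", [0.9], fail=True)]
print(asyncio.run(get_instance_offers(backends, requirements=None)))  # [('aws', 0.4), ('aws', 1.2)]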
dstack/_internal/server/services/projects.py CHANGED
@@ -13,7 +13,12 @@ from dstack._internal.core.backends.dstack.models import (
 )
 from dstack._internal.core.backends.models import BackendInfo
 from dstack._internal.core.errors import ForbiddenError, ResourceExistsError, ServerClientError
-from dstack._internal.core.models.projects import Member, MemberPermissions, Project
+from dstack._internal.core.models.projects import (
+    Member,
+    MemberPermissions,
+    Project,
+    ProjectHookConfig,
+)
 from dstack._internal.core.models.runs import RunStatus
 from dstack._internal.core.models.users import GlobalRole, ProjectRole
 from dstack._internal.server.models import (
@@ -120,6 +125,7 @@ async def create_project(
     user: UserModel,
     project_name: str,
     is_public: bool = False,
+    config: Optional[ProjectHookConfig] = None,
 ) -> Project:
     user_permissions = users.get_user_permissions(user)
     if not user_permissions.can_create_projects:
@@ -147,7 +153,7 @@ async def create_project(
         session=session, project_name=project_name
     )
     for hook in _CREATE_PROJECT_HOOKS:
-        await hook(session, project_model)
+        await hook(session, project_model, config)
     # a hook may change project
     session.expire(project_model)
     project_model = await get_project_model_by_name_or_error(
@@ -609,7 +615,9 @@ def get_member_permissions(member_model: MemberModel) -> MemberPermissions:
 _CREATE_PROJECT_HOOKS = []


-def register_create_project_hook(func):
+def register_create_project_hook(
+    func: Callable[[AsyncSession, ProjectModel, Optional[ProjectHookConfig]], Awaitable[None]],
+):
     _CREATE_PROJECT_HOOKS.append(func)

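Together with the ProjectHookConfig model added in dstack/_internal/core/models/projects.py, the typed hook signature lets server plugins receive extra settings at project creation: create_project now forwards its config argument to every registered hook. A hedged sketch of how a plugin might use this API; the CustomConfig field and the hook body are hypothetical:

from typing import Optional

from sqlalchemy.ext.asyncio import AsyncSession

from dstack._internal.core.models.projects import ProjectHookConfig
from dstack._internal.server.models import ProjectModel
from dstack._internal.server.services.projects import register_create_project_hook


class CustomConfig(ProjectHookConfig):
    default_quota_gpus: int = 0  # hypothetical extension field


async def on_project_created(
    session: AsyncSession, project: ProjectModel, config: Optional[ProjectHookConfig]
) -> None:
    quota = config.default_quota_gpus if isinstance(config, CustomConfig) else 0
    ...  # hypothetical: persist per-project defaults for `project` using `session` and `quota`


register_create_project_hook(on_project_created)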