dstack 0.19.17__py3-none-any.whl → 0.19.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack might be problematic.
- dstack/_internal/cli/services/configurators/fleet.py +13 -1
- dstack/_internal/core/backends/aws/compute.py +237 -18
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/cudo/compute.py +23 -9
- dstack/_internal/core/backends/gcp/compute.py +13 -7
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
- dstack/_internal/core/compatibility/fleets.py +12 -11
- dstack/_internal/core/compatibility/gateways.py +9 -8
- dstack/_internal/core/compatibility/logs.py +4 -3
- dstack/_internal/core/compatibility/runs.py +17 -20
- dstack/_internal/core/compatibility/volumes.py +9 -8
- dstack/_internal/core/errors.py +4 -0
- dstack/_internal/core/models/common.py +7 -0
- dstack/_internal/core/services/diff.py +36 -3
- dstack/_internal/server/app.py +20 -0
- dstack/_internal/server/background/__init__.py +61 -37
- dstack/_internal/server/background/tasks/process_fleets.py +19 -3
- dstack/_internal/server/background/tasks/process_gateways.py +1 -1
- dstack/_internal/server/background/tasks/process_instances.py +13 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +4 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +14 -3
- dstack/_internal/server/background/tasks/process_runs.py +8 -4
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +36 -7
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +5 -3
- dstack/_internal/server/background/tasks/process_volumes.py +2 -2
- dstack/_internal/server/services/fleets.py +5 -4
- dstack/_internal/server/services/gateways/__init__.py +4 -2
- dstack/_internal/server/services/jobs/configurators/base.py +5 -1
- dstack/_internal/server/services/locking.py +101 -12
- dstack/_internal/server/services/runs.py +24 -40
- dstack/_internal/server/services/volumes.py +2 -2
- dstack/_internal/server/settings.py +18 -4
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d151637af20f70b2e796.js → main-d1ac2e8c38ed5f08a114.js} +68 -64
- dstack/_internal/server/statics/{main-d151637af20f70b2e796.js.map → main-d1ac2e8c38ed5f08a114.js.map} +1 -1
- dstack/_internal/server/statics/{main-d48635d8fe670d53961c.css → main-d58fc0460cb0eae7cb5c.css} +1 -1
- dstack/_internal/server/testing/common.py +7 -3
- dstack/version.py +1 -1
- {dstack-0.19.17.dist-info → dstack-0.19.18.dist-info}/METADATA +11 -10
- {dstack-0.19.17.dist-info → dstack-0.19.18.dist-info}/RECORD +43 -43
- {dstack-0.19.17.dist-info → dstack-0.19.18.dist-info}/WHEEL +0 -0
- {dstack-0.19.17.dist-info → dstack-0.19.18.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.17.dist-info → dstack-0.19.18.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/compatibility/gateways.py
CHANGED

@@ -1,34 +1,35 @@
-from typing import Dict
-
+from dstack._internal.core.models.common import IncludeExcludeDictType
 from dstack._internal.core.models.gateways import GatewayConfiguration, GatewaySpec
 
 
-def get_gateway_spec_excludes(gateway_spec: GatewaySpec) -> Dict:
+def get_gateway_spec_excludes(gateway_spec: GatewaySpec) -> IncludeExcludeDictType:
     """
     Returns `gateway_spec` exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    spec_excludes = {}
+    spec_excludes: IncludeExcludeDictType = {}
     spec_excludes["configuration"] = _get_gateway_configuration_excludes(
         gateway_spec.configuration
     )
     return spec_excludes
 
 
-def get_create_gateway_excludes(configuration: GatewayConfiguration) -> Dict:
+def get_create_gateway_excludes(configuration: GatewayConfiguration) -> IncludeExcludeDictType:
     """
     Returns an exclude mapping to exclude certain fields from the create gateway request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    create_gateway_excludes = {}
+    create_gateway_excludes: IncludeExcludeDictType = {}
     create_gateway_excludes["configuration"] = _get_gateway_configuration_excludes(configuration)
     return create_gateway_excludes
 
 
-def _get_gateway_configuration_excludes(configuration: GatewayConfiguration) -> Dict:
-    configuration_excludes = {}
+def _get_gateway_configuration_excludes(
+    configuration: GatewayConfiguration,
+) -> IncludeExcludeDictType:
+    configuration_excludes: IncludeExcludeDictType = {}
     if configuration.tags is None:
         configuration_excludes["tags"] = True
     return configuration_excludes
dstack/_internal/core/compatibility/logs.py
CHANGED

@@ -1,15 +1,16 @@
-from typing import Dict, Optional
+from typing import Optional
 
+from dstack._internal.core.models.common import IncludeExcludeDictType
 from dstack._internal.server.schemas.logs import PollLogsRequest
 
 
-def get_poll_logs_excludes(request: PollLogsRequest) -> Optional[Dict]:
+def get_poll_logs_excludes(request: PollLogsRequest) -> Optional[IncludeExcludeDictType]:
     """
     Returns exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    excludes = {}
+    excludes: IncludeExcludeDictType = {}
     if request.next_token is None:
         excludes["next_token"] = True
     return excludes if excludes else None
dstack/_internal/core/compatibility/runs.py
CHANGED

@@ -1,29 +1,30 @@
-from typing import Dict, Optional, Set
+from typing import Optional
 
+from dstack._internal.core.models.common import IncludeExcludeDictType, IncludeExcludeSetType
 from dstack._internal.core.models.configurations import ServiceConfiguration
 from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSpec, JobSubmission, RunSpec
 from dstack._internal.server.schemas.runs import GetRunPlanRequest
 
 
-def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
+def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeDictType]:
     """
     Returns `plan` exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    apply_plan_excludes = {}
+    apply_plan_excludes: IncludeExcludeDictType = {}
     run_spec_excludes = get_run_spec_excludes(plan.run_spec)
     if run_spec_excludes is not None:
         apply_plan_excludes["run_spec"] = run_spec_excludes
     current_resource = plan.current_resource
     if current_resource is not None:
-        current_resource_excludes = {}
+        current_resource_excludes: IncludeExcludeDictType = {}
         current_resource_excludes["status_message"] = True
         if current_resource.deployment_num == 0:
             current_resource_excludes["deployment_num"] = True
         apply_plan_excludes["current_resource"] = current_resource_excludes
         current_resource_excludes["run_spec"] = get_run_spec_excludes(current_resource.run_spec)
-        job_submissions_excludes = {}
+        job_submissions_excludes: IncludeExcludeDictType = {}
         current_resource_excludes["jobs"] = {
             "__all__": {
                 "job_spec": get_job_spec_excludes([job.job_spec for job in current_resource.jobs]),

@@ -45,7 +46,7 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
             job_submissions_excludes["deployment_num"] = True
         latest_job_submission = current_resource.latest_job_submission
         if latest_job_submission is not None:
-            latest_job_submission_excludes = {}
+            latest_job_submission_excludes: IncludeExcludeDictType = {}
             current_resource_excludes["latest_job_submission"] = latest_job_submission_excludes
             if _should_exclude_job_submission_jpd_cpu_arch(latest_job_submission):
                 latest_job_submission_excludes["job_provisioning_data"] = {

@@ -62,12 +63,12 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
     return {"plan": apply_plan_excludes}
 
 
-def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
+def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[IncludeExcludeDictType]:
     """
     Excludes new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    get_plan_excludes = {}
+    get_plan_excludes: IncludeExcludeDictType = {}
     run_spec_excludes = get_run_spec_excludes(request.run_spec)
     if run_spec_excludes is not None:
         get_plan_excludes["run_spec"] = run_spec_excludes

@@ -76,15 +77,15 @@ def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
     return get_plan_excludes
 
 
-def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]:
+def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
     """
     Returns `run_spec` exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    spec_excludes: Dict = {}
-    configuration_excludes: Dict = {}
-    profile_excludes: Set = set()
+    spec_excludes: IncludeExcludeDictType = {}
+    configuration_excludes: IncludeExcludeDictType = {}
+    profile_excludes: IncludeExcludeSetType = set()
     configuration = run_spec.configuration
     profile = run_spec.profile
 

@@ -121,18 +122,16 @@ def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]:
         spec_excludes["configuration"] = configuration_excludes
     if profile_excludes:
         spec_excludes["profile"] = profile_excludes
-    if spec_excludes:
-        return spec_excludes
-    return None
+    return spec_excludes
 
 
-def get_job_spec_excludes(job_specs: list[JobSpec]) -> Optional[dict]:
+def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
     """
     Returns `job_spec` exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    spec_excludes: dict = {}
+    spec_excludes: IncludeExcludeDictType = {}
 
     if all(s.repo_code_hash is None for s in job_specs):
         spec_excludes["repo_code_hash"] = True

@@ -141,9 +140,7 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> Optional[dict]:
     if all(not s.file_archives for s in job_specs):
         spec_excludes["file_archives"] = True
 
-    if spec_excludes:
-        return spec_excludes
-    return None
+    return spec_excludes
 
 
 def _should_exclude_job_submission_jpd_cpu_arch(job_submission: JobSubmission) -> bool:
dstack/_internal/core/compatibility/volumes.py
CHANGED

@@ -1,32 +1,33 @@
-from typing import Dict
-
+from dstack._internal.core.models.common import IncludeExcludeDictType
 from dstack._internal.core.models.volumes import VolumeConfiguration, VolumeSpec
 
 
-def get_volume_spec_excludes(volume_spec: VolumeSpec) -> Dict:
+def get_volume_spec_excludes(volume_spec: VolumeSpec) -> IncludeExcludeDictType:
     """
     Returns `volume_spec` exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    spec_excludes = {}
+    spec_excludes: IncludeExcludeDictType = {}
     spec_excludes["configuration"] = _get_volume_configuration_excludes(volume_spec.configuration)
     return spec_excludes
 
 
-def get_create_volume_excludes(configuration: VolumeConfiguration) -> Dict:
+def get_create_volume_excludes(configuration: VolumeConfiguration) -> IncludeExcludeDictType:
     """
     Returns an exclude mapping to exclude certain fields from the create volume request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    create_volume_excludes = {}
+    create_volume_excludes: IncludeExcludeDictType = {}
     create_volume_excludes["configuration"] = _get_volume_configuration_excludes(configuration)
     return create_volume_excludes
 
 
-def _get_volume_configuration_excludes(configuration: VolumeConfiguration) -> Dict:
-    configuration_excludes = {}
+def _get_volume_configuration_excludes(
+    configuration: VolumeConfiguration,
+) -> IncludeExcludeDictType:
+    configuration_excludes: IncludeExcludeDictType = {}
     if configuration.tags is None:
         configuration_excludes["tags"] = True
     return configuration_excludes
dstack/_internal/core/errors.py
CHANGED

@@ -110,6 +110,10 @@ class PlacementGroupInUseError(ComputeError):
     pass
 
 
+class PlacementGroupNotSupportedError(ComputeError):
+    pass
+
+
 class NotYetTerminated(ComputeError):
     """
     Used by Compute.terminate_instance to signal that instance termination is not complete
dstack/_internal/core/models/common.py
CHANGED

@@ -6,6 +6,13 @@ from pydantic import Field
 from pydantic_duality import DualBaseModel
 from typing_extensions import Annotated
 
+IncludeExcludeFieldType = Union[int, str]
+IncludeExcludeSetType = set[IncludeExcludeFieldType]
+IncludeExcludeDictType = dict[
+    IncludeExcludeFieldType, Union[bool, IncludeExcludeSetType, "IncludeExcludeDictType"]
+]
+IncludeExcludeType = Union[IncludeExcludeSetType, IncludeExcludeDictType]
+
 
 # DualBaseModel creates two classes for the model:
 # one with extra = "forbid" (CoreModel/CoreModel.__request__),
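These aliases type the nested `include`/`exclude` mappings accepted by pydantic's `.dict()`/`.json()`, which the compatibility helpers above build. A minimal sketch of how such a mapping is consumed (the `Run`/`Job` models below are illustrative, not dstack's):

```python
from typing import Optional

from pydantic import BaseModel


class Job(BaseModel):
    name: str
    repo_code_hash: Optional[str] = None


class Run(BaseModel):
    id: int
    jobs: list[Job] = []


run = Run(id=1, jobs=[Job(name="train")])

# An IncludeExcludeDictType-shaped mapping: True drops a field entirely,
# "__all__" applies the nested mapping to every item of a list field.
excludes = {"jobs": {"__all__": {"repo_code_hash": True}}}
print(run.dict(exclude=excludes))
# -> {'id': 1, 'jobs': [{'name': 'train'}]}
```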
dstack/_internal/core/services/diff.py
CHANGED

@@ -1,14 +1,47 @@
-from typing import Any, Dict
+from typing import Any, Optional, TypedDict
 
 from pydantic import BaseModel
 
+from dstack._internal.core.models.common import IncludeExcludeType
+
+
+class ModelFieldDiff(TypedDict):
+    old: Any
+    new: Any
+
+
+ModelDiff = dict[str, ModelFieldDiff]
+
 
 # TODO: calculate nested diffs
-def diff_models(old: BaseModel, new: BaseModel) -> Dict:
+def diff_models(
+    old: BaseModel, new: BaseModel, ignore: Optional[IncludeExcludeType] = None
+) -> ModelDiff:
+    """
+    Returns a diff of model instances fields.
+
+    NOTE: `ignore` is implemented as `BaseModel.parse_obj(BaseModel.dict(exclude=ignore))`,
+    that is, the "ignored" fields are actually not ignored but reset to the default values
+    before comparison, meaning that 1) any field in `ignore` must have a default value,
+    2) the default value must be equal to itself (e.g. `math.nan` != `math.nan`).
+
+    Args:
+        old: The "old" model instance.
+        new: The "new" model instance.
+        ignore: Optional fields to ignore.
+
+    Returns:
+        A dict of changed fields in the form of
+        `{<field_name>: {"old": old_value, "new": new_value}}`
+    """
     if type(old) is not type(new):
         raise TypeError("Both instances must be of the same Pydantic model class.")
 
-    changes = {}
+    if ignore is not None:
+        old = type(old).parse_obj(old.dict(exclude=ignore))
+        new = type(new).parse_obj(new.dict(exclude=ignore))
+
+    changes: ModelDiff = {}
     for field in old.__fields__:
         old_value = getattr(old, field)
         new_value = getattr(new, field)
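A usage sketch of the extended signature (the `Profile` model here is hypothetical, chosen only to show the `ignore` semantics described in the docstring):

```python
from typing import Optional

from pydantic import BaseModel

from dstack._internal.core.services.diff import diff_models


class Profile(BaseModel):
    name: str
    retry: bool = False
    comment: Optional[str] = None


old = Profile(name="dev", comment="a")
new = Profile(name="dev", retry=True, comment="b")

# `comment` is reset to its default on both sides before comparison,
# so only `retry` shows up in the diff.
print(diff_models(old, new, ignore={"comment"}))
# -> {'retry': {'old': False, 'new': True}}
```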
dstack/_internal/server/app.py
CHANGED

@@ -2,6 +2,7 @@ import asyncio
 import importlib.resources
 import os
 import time
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import Awaitable, Callable, List

@@ -97,6 +98,8 @@ def create_app() -> FastAPI:
     @asynccontextmanager
     async def lifespan(app: FastAPI):
         configure_logging()
+        server_executor = ThreadPoolExecutor(max_workers=settings.SERVER_EXECUTOR_MAX_WORKERS)
+        asyncio.get_running_loop().set_default_executor(server_executor)
         await migrate()
         _print_dstack_logo()
         if not check_required_ssh_version():

@@ -242,6 +245,23 @@ def register_routes(app: FastAPI, ui: bool = True):
         )
         return response
 
+    if settings.SERVER_PROFILING_ENABLED:
+        from pyinstrument import Profiler
+
+        @app.middleware("http")
+        async def profile_request(request: Request, call_next):
+            profiling = request.query_params.get("profile", False)
+            if profiling:
+                profiler = Profiler()
+                profiler.start()
+                respone = await call_next(request)
+                profiler.stop()
+                with open("profiling_results.html", "w+") as f:
+                    f.write(profiler.output_html())
+                return respone
+            else:
+                return await call_next(request)
+
     # this middleware must be defined after the log_request middleware
     @app.middleware("http")
     async def log_http_metrics(request: Request, call_next):
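For context, `set_default_executor` bounds the thread pool used by every `loop.run_in_executor(None, ...)` call in the process. A minimal sketch of the mechanism, independent of dstack (names here are illustrative):

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor


def blocking_io() -> str:
    # Stand-in for a blocking call, e.g. a sync driver or file I/O.
    return "done"


async def main() -> None:
    loop = asyncio.get_running_loop()
    # From now on, run_in_executor(None, ...) uses this bounded pool
    # instead of asyncio's implicitly sized default executor.
    loop.set_default_executor(ThreadPoolExecutor(max_workers=8))
    print(await loop.run_in_executor(None, blocking_io))


asyncio.run(main())
```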
dstack/_internal/server/background/__init__.py
CHANGED

@@ -37,15 +37,31 @@ def get_scheduler() -> AsyncIOScheduler:
 
 
 def start_background_tasks() -> AsyncIOScheduler:
+    # We try to process as many resources as possible without exhausting DB connections.
+    #
+    # Quick tasks can process multiple resources per transaction.
+    # Potentially long tasks process one resource per transaction
+    # to avoid holding locks for all the resources if one is slow to process.
+    # Still, the next batch won't be processed unless all resources are processed,
+    # so larger batches do not increase processing rate linearly.
+    #
+    # The interval, batch_size, and max_instances determine background tasks processing rates.
+    # By default, one server replica can handle:
+    #
+    # * 150 active jobs with 2 minutes processing latency
+    # * 150 active runs with 2 minutes processing latency
+    # * 150 active instances with 2 minutes processing latency
+    #
+    # These latency numbers do not account for provisioning time,
+    # so it may be slower if a backend is slow to provision.
+    #
+    # Users can set SERVER_BACKGROUND_PROCESSING_FACTOR to process more resources per replica.
+    # They also need to increase max db connections on the client side and db side.
+    #
     # In-memory locking via locksets does not guarantee
     # that the first waiting for the lock will acquire it.
     # The jitter is needed to give all tasks a chance to acquire locks.
 
-    # The batch_size and interval determine background tasks processing rates.
-    # Currently one server replica can handle:
-    # * 150 active jobs with up to 2 minutes processing latency
-    # * 150 active runs with up to 2 minutes processing latency
-    # * 150 active instances with up to 2 minutes processing latency
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
     if settings.ENABLE_PROMETHEUS_METRICS:

@@ -53,38 +69,6 @@ def start_background_tasks() -> AsyncIOScheduler:
             collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
         )
         _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
-    # process_submitted_jobs and process_instances max processing rate is 75 jobs(instances) per minute.
-    _scheduler.add_job(
-        process_submitted_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_running_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_terminating_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_runs,
-        IntervalTrigger(seconds=2, jitter=1),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_instances,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(process_fleets, IntervalTrigger(seconds=10, jitter=2))
     _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15))
     _scheduler.add_job(
         process_submitted_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5

@@ -93,5 +77,45 @@ def start_background_tasks() -> AsyncIOScheduler:
         process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5
     )
     _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5))
+    for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
+        # Add multiple copies of tasks if requested.
+        # max_instances=1 for additional copies to avoid running too many tasks.
+        # Move other tasks here when they need per-replica scaling.
+        _scheduler.add_job(
+            process_submitted_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=4 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_running_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_terminating_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_runs,
+            IntervalTrigger(seconds=2, jitter=1),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_instances,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_fleets,
+            IntervalTrigger(seconds=10, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
     _scheduler.start()
     return _scheduler
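The capacity figures in the new comment follow directly from the schedule parameters. A back-of-the-envelope check (assuming every tick processes a full batch and ignoring `max_instances` overlap):

```python
interval_s = 4          # IntervalTrigger(seconds=4)
batch_size = 5          # kwargs={"batch_size": 5}
latency_budget_s = 120  # "2 minutes processing latency"

per_minute = batch_size * 60 / interval_s      # 75 resources/minute
capacity = per_minute * latency_budget_s / 60  # 150 active resources
print(per_minute, capacity)  # 75.0 150.0
```

This matches the "75 jobs(instances) per minute" figure from the removed comment and the "150 active" figures in the new one; `SERVER_BACKGROUND_PROCESSING_FACTOR` scales these numbers by adding task copies.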
dstack/_internal/server/background/tasks/process_fleets.py
CHANGED

@@ -1,9 +1,12 @@
+import asyncio
+from datetime import timedelta
+
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.models.fleets import FleetStatus
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import FleetModel
 from dstack._internal.server.services.fleets import (
     is_fleet_empty,

@@ -17,8 +20,18 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
-async def process_fleets():
-    lock, lockset = get_locker().get_lockset(FleetModel.__tablename__)
+MIN_PROCESSING_INTERVAL = timedelta(seconds=30)
+
+
+async def process_fleets(batch_size: int = 1):
+    tasks = []
+    for _ in range(batch_size):
+        tasks.append(_process_next_fleet())
+    await asyncio.gather(*tasks)
+
+
+async def _process_next_fleet():
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(

@@ -26,6 +39,8 @@ async def process_fleets():
                 .where(
                     FleetModel.deleted == False,
                     FleetModel.id.not_in(lockset),
+                    FleetModel.last_processed_at
+                    < get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
                 )
                 .order_by(FleetModel.last_processed_at.asc())
                 .limit(1)

@@ -43,6 +58,7 @@
 
 
 async def _process_fleet(session: AsyncSession, fleet_model: FleetModel):
+    logger.debug("Processing fleet %s", fleet_model.name)
     # Refetch to load related attributes.
     # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
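The `process_fleets`/`_process_next_fleet` split is a general pattern in these tasks: spawn `batch_size` independent one-row workers per tick, each with its own session and lock, so one slow resource only delays its own transaction. A simplified, DB-free sketch of the control flow:

```python
import asyncio


async def _process_next_resource(i: int) -> None:
    # In the real task this opens a session, picks one unlocked row
    # (not_in(lockset) + FOR UPDATE SKIP LOCKED), and processes it.
    await asyncio.sleep(0.01)
    print(f"worker {i} done")


async def process_resources(batch_size: int = 5) -> None:
    await asyncio.gather(*(_process_next_resource(i) for i in range(batch_size)))


asyncio.run(process_resources())
```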
dstack/_internal/server/background/tasks/process_gateways.py
CHANGED

@@ -28,7 +28,7 @@ async def process_gateways_connections():
 
 
 async def process_submitted_gateways():
-    lock, lockset = get_locker().get_lockset(GatewayModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
dstack/_internal/server/background/tasks/process_instances.py
CHANGED

@@ -45,6 +45,7 @@ from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
 from dstack._internal.core.errors import (
     BackendError,
     NotYetTerminated,
+    PlacementGroupNotSupportedError,
     ProvisioningError,
 )
 from dstack._internal.core.models.backends.base import BackendType

@@ -73,7 +74,7 @@ from dstack._internal.core.models.runs import (
 from dstack._internal.core.services.profiles import get_retry
 from dstack._internal.server import settings as server_settings
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
     InstanceModel,

@@ -110,6 +111,8 @@ from dstack._internal.utils.ssh import (
     pkey_from_str,
 )
 
+MIN_PROCESSING_INTERVAL = timedelta(seconds=10)
+
 PENDING_JOB_RETRY_INTERVAL = timedelta(seconds=60)
 
 TERMINATION_DEADLINE_OFFSET = timedelta(minutes=20)

@@ -129,7 +132,7 @@ async def process_instances(batch_size: int = 1):
 
 
 async def _process_next_instance():
-    lock, lockset = get_locker().get_lockset(InstanceModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(InstanceModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(

@@ -145,6 +148,8 @@ async def _process_next_instance():
                     ]
                 ),
                 InstanceModel.id.not_in(lockset),
+                InstanceModel.last_processed_at
+                < get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
             )
             .options(lazyload(InstanceModel.jobs))
             .order_by(InstanceModel.last_processed_at.asc())

@@ -1063,6 +1068,12 @@ async def _create_placement_group(
             placement_group_model_to_placement_group(placement_group_model),
             master_instance_offer,
         )
+    except PlacementGroupNotSupportedError:
+        logger.debug(
+            "Skipping offer %s because placement group not supported",
+            master_instance_offer.instance.name,
+        )
+        return None
     except BackendError as e:
         logger.warning(
             "Failed to create placement group %s in %s/%s: %r",
dstack/_internal/server/background/tasks/process_placement_groups.py
CHANGED

@@ -7,7 +7,7 @@ from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
 from dstack._internal.core.errors import PlacementGroupInUseError
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import PlacementGroupModel, ProjectModel
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services.locking import get_locker

@@ -19,7 +19,9 @@ logger = get_logger(__name__)
 
 
 async def process_placement_groups():
-    lock, lockset = get_locker().get_lockset(PlacementGroupModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(
+        PlacementGroupModel.__tablename__
+    )
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
dstack/_internal/server/background/tasks/process_running_jobs.py
CHANGED

@@ -34,10 +34,11 @@ from dstack._internal.core.models.runs import (
     JobTerminationReason,
     Run,
     RunSpec,
+    RunStatus,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
     JobModel,

@@ -79,6 +80,7 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+MIN_PROCESSING_INTERVAL = timedelta(seconds=10)
 # Minimum time before terminating active job in case of connectivity issues.
 # Should be sufficient to survive most problems caused by
 # the server network flickering and providers' glitches.

@@ -93,20 +95,29 @@ async def process_running_jobs(batch_size: int = 1):
 
 
 async def _process_next_running_job():
-    lock, lockset = get_locker().get_lockset(JobModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
                 select(JobModel)
+                .join(JobModel.run)
                 .where(
                     JobModel.status.in_(
                         [JobStatus.PROVISIONING, JobStatus.PULLING, JobStatus.RUNNING]
                     ),
+                    RunModel.status.not_in([RunStatus.TERMINATING]),
                    JobModel.id.not_in(lockset),
+                    JobModel.last_processed_at
+                    < common_utils.get_current_datetime().replace(tzinfo=None)
+                    - MIN_PROCESSING_INTERVAL,
                 )
                 .order_by(JobModel.last_processed_at.asc())
                 .limit(1)
-                .with_for_update(skip_locked=True)
+                .with_for_update(
+                    skip_locked=True,
+                    key_share=True,
+                    of=JobModel,
+                )
             )
             job_model = res.unique().scalar()
             if job_model is None: