dstack 0.18.43__py3-none-any.whl → 0.18.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/services/configurators/run.py +1 -0
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/aws/compute.py +1 -0
- dstack/_internal/core/backends/azure/compute.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +1 -1
- dstack/_internal/core/backends/runpod/compute.py +21 -3
- dstack/_internal/core/backends/runpod/config.py +8 -0
- dstack/_internal/core/models/backends/runpod.py +2 -0
- dstack/_internal/core/models/configurations.py +2 -1
- dstack/_internal/core/models/profiles.py +46 -1
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/server/app.py +11 -1
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +66 -19
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/models.py +11 -0
- dstack/_internal/server/routers/metrics.py +21 -2
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
- dstack/_internal/server/services/config.py +13 -3
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/gateways/__init__.py +1 -0
- dstack/_internal/server/services/jobs/configurators/base.py +9 -1
- dstack/_internal/server/services/metrics.py +103 -70
- dstack/_internal/server/services/prometheus.py +87 -0
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +43 -15
- dstack/_internal/server/services/volumes.py +1 -0
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4eb116b97819badd1e2c.js} +66 -13
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
- dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +17 -0
- dstack/api/_public/runs.py +3 -0
- dstack/api/server/_fleets.py +2 -0
- dstack/api/server/_runs.py +4 -0
- dstack/api/utils.py +3 -0
- dstack/version.py +1 -1
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/METADATA +10 -1
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/RECORD +59 -50
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +125 -0
- tests/_internal/server/routers/test_fleets.py +2 -0
- tests/_internal/server/routers/test_metrics.py +15 -0
- tests/_internal/server/routers/test_prometheus.py +244 -0
- tests/_internal/server/routers/test_runs.py +79 -56
- tests/_internal/server/services/test_metrics.py +163 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0

dstack/_internal/cli/services/configurators/run.py
CHANGED

@@ -95,6 +95,7 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
             reservation=profile.reservation,
             spot_policy=profile.spot_policy,
             retry_policy=profile.retry_policy,
+            utilization_policy=profile.utilization_policy,
             max_duration=profile.max_duration,
             stop_duration=profile.stop_duration,
             max_price=profile.max_price,

dstack/_internal/cli/utils/run.py
CHANGED

@@ -4,6 +4,8 @@ from rich.markup import escape
 from rich.table import Table

 from dstack._internal.cli.utils.common import NO_OFFERS_WARNING, add_row_from_dict, console
+from dstack._internal.core.models.common import is_core_model_instance
+from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
 from dstack._internal.core.models.instances import InstanceAvailability
 from dstack._internal.core.models.profiles import (
     DEFAULT_RUN_TERMINATION_IDLE_TIME,
@@ -38,6 +40,13 @@ def print_run_plan(run_plan: RunPlan, offers_limit: int = 3):
         if job_plan.job_spec.max_duration
         else "-"
     )
+    inactivity_duration = None
+    if is_core_model_instance(run_plan.run_spec.configuration, DevEnvironmentConfiguration):
+        inactivity_duration = "-"
+        if isinstance(run_plan.run_spec.configuration.inactivity_duration, int):
+            inactivity_duration = format_pretty_duration(
+                run_plan.run_spec.configuration.inactivity_duration
+            )
     if job_plan.job_spec.retry is None:
         retry = "-"
     else:
@@ -72,6 +81,8 @@ def print_run_plan(run_plan: RunPlan, offers_limit: int = 3):
     props.add_row(th("Resources"), pretty_req)
     props.add_row(th("Max price"), max_price)
     props.add_row(th("Max duration"), max_duration)
+    if inactivity_duration is not None:  # None means n/a
+        props.add_row(th("Inactivity duration"), inactivity_duration)
     props.add_row(th("Spot policy"), spot_policy)
     props.add_row(th("Retry policy"), retry)
     props.add_row(th("Creation policy"), creation_policy)

dstack/_internal/core/backends/azure/compute.py
CHANGED

@@ -312,7 +312,7 @@ def get_resource_group_network_subnet_or_error(
         except Exception:
             raise ComputeError(
                 "Network specified in incorrect format."
-                " Supported format for `
+                " Supported format for `vpc_ids` values: 'networkResourceGroupName/networkName'"
             )
     elif resource_group is not None:
         network_name = azure_utils.get_default_network_name(resource_group, location)

dstack/_internal/core/backends/gcp/compute.py
CHANGED

@@ -580,7 +580,7 @@ class GCPCompute(Compute):
             operation = self.disk_client.delete(
                 project=self.config.project_id,
                 zone=get_or_error(volume.provisioning_data).availability_zone,
-                disk=volume.
+                disk=volume.volume_id,
             )
             gcp_resources.wait_for_extended_operation(operation, "persistent disk deletion")
         except google.api_core.exceptions.NotFound:

dstack/_internal/core/backends/runpod/compute.py
CHANGED

@@ -52,8 +52,9 @@ class RunpodCompute(Compute):
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.RUNPOD,
-            locations=self.config.regions,
+            locations=self.config.regions or None,
             requirements=requirements,
+            extra_filter=lambda o: _is_secure_cloud(o.region) or self.config.allow_community_cloud,
         )
         offers = [
             InstanceOfferWithAvailability(
@@ -102,13 +103,22 @@ class RunpodCompute(Compute):
         bid_per_gpu = None
         if instance_offer.instance.resources.spot and gpu_count:
             bid_per_gpu = instance_offer.price / gpu_count
+        if _is_secure_cloud(instance_offer.region):
+            cloud_type = "SECURE"
+            data_center_id = instance_offer.region
+            country_code = None
+        else:
+            cloud_type = "COMMUNITY"
+            data_center_id = None
+            country_code = instance_offer.region

         resp = self.api_client.create_pod(
             name=pod_name,
             image_name=job.job_spec.image_name,
             gpu_type_id=instance_offer.instance.name,
-            cloud_type=
-            data_center_id=
+            cloud_type=cloud_type,
+            data_center_id=data_center_id,
+            country_code=country_code,
             gpu_count=gpu_count,
             container_disk_in_gb=disk_size,
             min_vcpu_count=instance_offer.instance.resources.cpus,
@@ -257,3 +267,11 @@ def _get_volume_price(size: int) -> float:
     if size < 1000:
         return 0.07 * size
     return 0.05 * size
+
+
+def _is_secure_cloud(region: str) -> str:
+    """
+    Secure cloud regions are datacenter IDs: CA-MTL-1, EU-NL-1, etc.
+    Community cloud regions are country codes: CA, NL, etc.
+    """
+    return "-" in region
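
The new `_is_secure_cloud` helper and the `extra_filter` passed to `get_catalog_offers` decide whether community-cloud offers are exposed at all. Below is a minimal standalone sketch of that filtering logic; the `Offer` dataclass is a stand-in for dstack's offer model, not the real class.

```python
from dataclasses import dataclass


@dataclass
class Offer:
    region: str
    price: float


def is_secure_cloud(region: str) -> bool:
    # Secure cloud regions are datacenter IDs ("CA-MTL-1", "EU-NL-1"),
    # community cloud regions are bare country codes ("CA", "NL"),
    # so the presence of a dash marks a secure-cloud offer.
    return "-" in region


def filter_offers(offers: list[Offer], allow_community_cloud: bool) -> list[Offer]:
    # Mirrors the extra_filter lambda above: secure-cloud offers always pass,
    # community-cloud offers pass only when community cloud is allowed
    # (the new backend option defaults to allowing it).
    return [o for o in offers if is_secure_cloud(o.region) or allow_community_cloud]


offers = [Offer("CA-MTL-1", 1.2), Offer("CA", 0.9), Offer("EU-NL-1", 1.1)]
assert [o.region for o in filter_offers(offers, allow_community_cloud=False)] == ["CA-MTL-1", "EU-NL-1"]
assert len(filter_offers(offers, allow_community_cloud=True)) == 3
```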

dstack/_internal/core/backends/runpod/config.py
CHANGED

@@ -4,6 +4,14 @@ from dstack._internal.core.models.backends.runpod import (
     RunpodStoredConfig,
 )

+RUNPOD_COMMUNITY_CLOUD_DEFAULT = True
+

 class RunpodConfig(RunpodStoredConfig, BackendConfig):
     creds: AnyRunpodCreds
+
+    @property
+    def allow_community_cloud(self) -> bool:
+        if self.community_cloud is not None:
+            return self.community_cloud
+        return RUNPOD_COMMUNITY_CLOUD_DEFAULT

dstack/_internal/core/models/backends/runpod.py
CHANGED

@@ -10,6 +10,7 @@ from dstack._internal.core.models.common import CoreModel
 class RunpodConfigInfo(CoreModel):
     type: Literal["runpod"] = "runpod"
     regions: Optional[List[str]] = None
+    community_cloud: Optional[bool] = None


 class RunpodStoredConfig(RunpodConfigInfo):
@@ -33,6 +34,7 @@ class RunpodConfigInfoWithCredsPartial(CoreModel):
     type: Literal["runpod"] = "runpod"
     creds: Optional[AnyRunpodCreds]
     regions: Optional[List[str]]
+    community_cloud: Optional[bool]


 class RunpodConfigValues(CoreModel):

dstack/_internal/core/models/configurations.py
CHANGED

@@ -221,7 +221,8 @@ class DevEnvironmentConfigurationParams(CoreModel):
                 " Inactivity is defined as the absence of SSH connections to the"
                 " dev environment, including VS Code connections, `ssh <run name>`"
                 " shells, and attached `dstack apply` or `dstack attach` commands."
-                " Use `off` for unlimited duration.
+                " Use `off` for unlimited duration. Can be updated in-place."
+                " Defaults to `off`"
             )
         ),
     ]

dstack/_internal/core/models/profiles.py
CHANGED

@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import List, Optional, Union
+from typing import List, Optional, Union, overload

 from pydantic import Field, root_validator, validator
 from typing_extensions import Annotated, Literal
@@ -34,6 +34,14 @@ class TerminationPolicy(str, Enum):
     DESTROY_AFTER_IDLE = "destroy-after-idle"


+@overload
+def parse_duration(v: None) -> None: ...
+
+
+@overload
+def parse_duration(v: Union[int, str]) -> int: ...
+
+
 def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
     if v is None:
         return None
@@ -112,6 +120,39 @@ class ProfileRetry(CoreModel):
         return values


+class UtilizationPolicy(CoreModel):
+    _min_time_window = "5m"
+
+    min_gpu_utilization: Annotated[
+        int,
+        Field(
+            description=(
+                "Minimum required GPU utilization, percent."
+                " If any GPU has utilization below specified value during the whole time window,"
+                " the run is terminated"
+            ),
+            ge=0,
+            le=100,
+        ),
+    ]
+    time_window: Annotated[
+        Union[int, str],
+        Field(
+            description=(
+                "The time window of metric samples taking into account to measure utilization"
+                f" (e.g., `30m`, `1h`). Minimum is `{_min_time_window}`"
+            )
+        ),
+    ]
+
+    @validator("time_window", pre=True)
+    def validate_time_window(cls, v: Union[int, str]) -> int:
+        v = parse_duration(v)
+        if v < parse_duration(cls._min_time_window):
+            raise ValueError(f"Minimum time_window is {cls._min_time_window}")
+        return v
+
+
 class ProfileParams(CoreModel):
     backends: Annotated[
         Optional[List[BackendType]],
@@ -194,6 +235,10 @@ class ProfileParams(CoreModel):
             )
         ),
     ]
+    utilization_policy: Annotated[
+        Optional[UtilizationPolicy],
+        Field(description="Run termination policy based on utilization"),
+    ]
     # Deprecated:
     termination_policy: Annotated[
         Optional[TerminationPolicy],
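
Together, the `parse_duration` overloads and the `UtilizationPolicy` validator normalize `time_window` to an integer number of seconds and reject windows shorter than five minutes. A usage sketch, assuming the imports resolve as in the diff and that duration strings such as `"30m"` parse to seconds:

```python
from dstack._internal.core.models.profiles import UtilizationPolicy, parse_duration

# The @overload declarations only refine typing: None stays None,
# while int/str inputs always yield an int.
assert parse_duration(None) is None
assert parse_duration("30m") == 30 * 60  # assumes "Nm" means N minutes, returned as seconds

# time_window is normalized to seconds by the pre-validator.
policy = UtilizationPolicy(min_gpu_utilization=10, time_window="1h")
assert policy.time_window == 3600

# Anything below the 5-minute minimum is rejected.
try:
    UtilizationPolicy(min_gpu_utilization=10, time_window="1m")
except Exception as exc:  # pydantic wraps the ValueError raised by the validator
    print(type(exc).__name__)
```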

dstack/_internal/core/models/runs.py
CHANGED

@@ -23,6 +23,7 @@ from dstack._internal.core.models.profiles import (
     ProfileRetryPolicy,
     RetryEvent,
     SpotPolicy,
+    UtilizationPolicy,
 )
 from dstack._internal.core.models.repos import AnyRunRepoData
 from dstack._internal.core.models.resources import Memory, ResourcesSpec
@@ -114,6 +115,7 @@ class JobTerminationReason(str, Enum):
     ABORTED_BY_USER = "aborted_by_user"
     TERMINATED_BY_SERVER = "terminated_by_server"
     INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded"
+    TERMINATED_DUE_TO_UTILIZATION_POLICY = "terminated_due_to_utilization_policy"
     # Set by the runner
     CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error"
     PORTS_BINDING_FAILED = "ports_binding_failed"
@@ -135,6 +137,7 @@ class JobTerminationReason(str, Enum):
             self.ABORTED_BY_USER: JobStatus.ABORTED,
             self.TERMINATED_BY_SERVER: JobStatus.TERMINATED,
             self.INACTIVITY_DURATION_EXCEEDED: JobStatus.TERMINATED,
+            self.TERMINATED_DUE_TO_UTILIZATION_POLICY: JobStatus.TERMINATED,
             self.CONTAINER_EXITED_WITH_ERROR: JobStatus.FAILED,
             self.PORTS_BINDING_FAILED: JobStatus.FAILED,
             self.CREATING_CONTAINER_ERROR: JobStatus.FAILED,
@@ -190,6 +193,7 @@ class JobSpec(CoreModel):
     single_branch: Optional[bool] = None
     max_duration: Optional[int]
     stop_duration: Optional[int] = None
+    utilization_policy: Optional[UtilizationPolicy] = None
     registry_auth: Optional[RegistryAuth]
     requirements: Requirements
     retry: Optional[Retry]

dstack/_internal/server/app.py
CHANGED

@@ -29,6 +29,7 @@ from dstack._internal.server.routers import (
     metrics,
     pools,
     projects,
+    prometheus,
     repos,
     runs,
     secrets,
@@ -185,6 +186,7 @@ def register_routes(app: FastAPI, ui: bool = True):
     app.include_router(model_proxy.router, prefix="/proxy/models", tags=["model-proxy"])
     app.include_router(pools.root_router)
     app.include_router(pools.router)
+    app.include_router(prometheus.router)

     @app.exception_handler(ForbiddenError)
     async def forbidden_error_handler(request: Request, exc: ForbiddenError):
@@ -252,7 +254,11 @@ def register_routes(app: FastAPI, ui: bool = True):

     @app.exception_handler(404)
     async def custom_http_exception_handler(request, exc):
-        if
+        if (
+            request.url.path.startswith("/api")
+            or _is_proxy_request(request)
+            or _is_prometheus_request(request)
+        ):
             return JSONResponse(
                 {"detail": exc.detail},
                 status_code=status.HTTP_404_NOT_FOUND,
@@ -283,6 +289,10 @@ def _is_proxy_request(request: Request) -> bool:
     ) and referrer.path.startswith("/proxy")


+def _is_prometheus_request(request: Request) -> bool:
+    return request.url.path.startswith("/metrics")
+
+
 def _print_dstack_logo():
     console.print(
         """[purple]╱╱╭╮╱╱╭╮╱╱╱╱╱╱╭╮
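
The 404 handler now treats `/metrics` like `/api` and proxy paths: unknown URLs under these prefixes return a JSON 404 instead of falling through to the web UI. A simplified sketch of that decision; the SPA fallback below is a placeholder, not dstack's actual handler, and the proxy-referrer check is omitted:

```python
from fastapi import FastAPI, Request, status
from fastapi.responses import HTMLResponse, JSONResponse

app = FastAPI()


def _is_prometheus_request(request: Request) -> bool:
    # Same prefix check added in the diff: Prometheus endpoints live under /metrics.
    return request.url.path.startswith("/metrics")


@app.exception_handler(404)
async def custom_http_exception_handler(request: Request, exc):
    # API and Prometheus paths get a machine-readable 404;
    # everything else falls back to the single-page UI.
    if request.url.path.startswith("/api") or _is_prometheus_request(request):
        return JSONResponse({"detail": exc.detail}, status_code=status.HTTP_404_NOT_FOUND)
    return HTMLResponse("<!-- SPA index placeholder -->")
```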

dstack/_internal/server/background/__init__.py
CHANGED

@@ -1,6 +1,7 @@
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.triggers.interval import IntervalTrigger

+from dstack._internal.server import settings
 from dstack._internal.server.background.tasks.process_fleets import process_fleets
 from dstack._internal.server.background.tasks.process_gateways import (
     process_gateways_connections,
@@ -16,6 +17,10 @@ from dstack._internal.server.background.tasks.process_metrics import (
 from dstack._internal.server.background.tasks.process_placement_groups import (
     process_placement_groups,
 )
+from dstack._internal.server.background.tasks.process_prometheus_metrics import (
+    collect_prometheus_metrics,
+    delete_prometheus_metrics,
+)
 from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs
 from dstack._internal.server.background.tasks.process_runs import process_runs
 from dstack._internal.server.background.tasks.process_submitted_jobs import process_submitted_jobs
@@ -43,6 +48,11 @@ def start_background_tasks() -> AsyncIOScheduler:
     # * 150 active instances with up to 2 minutes processing latency
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
+    if settings.ENABLE_PROMETHEUS_METRICS:
+        _scheduler.add_job(
+            collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
+        )
+        _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
     # process_submitted_jobs and process_instances max processing rate is 75 jobs(instances) per minute.
     _scheduler.add_job(
         process_submitted_jobs,
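
The Prometheus tasks reuse the existing scheduling pattern: collection runs every 10 seconds (the task itself skips jobs scraped less than 9 seconds ago), and a cleanup job prunes rows past the 10-minute TTL every 5 minutes. A minimal sketch of that conditional registration, with the settings flag hard-coded for illustration:

```python
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger

ENABLE_PROMETHEUS_METRICS = True  # stands in for dstack._internal.server.settings


async def collect_prometheus_metrics() -> None: ...


async def delete_prometheus_metrics() -> None: ...


def start_background_tasks() -> AsyncIOScheduler:
    scheduler = AsyncIOScheduler()
    if ENABLE_PROMETHEUS_METRICS:
        # Collect frequently, prune stale rows on a slower cadence.
        scheduler.add_job(collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1)
        scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
    # scheduler.start() should be called once an asyncio event loop is running
    return scheduler
```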

dstack/_internal/server/background/tasks/process_placement_groups.py
CHANGED

@@ -28,6 +28,7 @@ async def process_placement_groups():
                 PlacementGroupModel.deleted == False,
                 PlacementGroupModel.id.not_in(lockset),
             )
+            .order_by(PlacementGroupModel.id)  # take locks in order
             .with_for_update(skip_locked=True)
         )
         placement_group_models = res.scalars().all()
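
This `.order_by(...)` before `.with_for_update(...)` appears in several places in this release (placement groups, pool instances, volumes). Locking rows in a consistent key order prevents two transactions from acquiring overlapping row locks in opposite orders and deadlocking, while `skip_locked` lets a replica pass over rows another worker already holds. A self-contained sketch of the pattern; `Item` is a stand-in model, not part of dstack:

```python
from sqlalchemy import Column, Integer, select
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Item(Base):
    __tablename__ = "items"
    id = Column(Integer, primary_key=True)


# Without order_by, two workers selecting overlapping rows FOR UPDATE may lock
# them in different orders and deadlock; ordering by primary key makes the
# lock acquisition order deterministic across workers.
stmt = (
    select(Item)
    .order_by(Item.id)
    .with_for_update(skip_locked=True)
)
print(stmt)
```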

dstack/_internal/server/background/tasks/process_prometheus_metrics.py
ADDED

@@ -0,0 +1,135 @@
+import uuid
+from datetime import datetime, timedelta
+from typing import Optional
+
+import sqlalchemy.exc
+from sqlalchemy import delete, or_, select, update
+from sqlalchemy.orm import joinedload
+
+from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
+from dstack._internal.core.models.runs import JobStatus
+from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.models import InstanceModel, JobModel, JobPrometheusMetrics
+from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
+from dstack._internal.server.services.pools import get_instance_ssh_private_keys
+from dstack._internal.server.services.runner import client
+from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
+from dstack._internal.server.utils.common import gather_map_async
+from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+MAX_JOBS_FETCHED = 100
+BATCH_SIZE = 10
+MIN_COLLECT_INTERVAL_SECONDS = 9
+# 10 minutes should be more than enough to scrape metrics, and, in any case,
+# 10 minutes old metrics has little to no value
+METRICS_TTL_SECONDS = 600
+
+
+async def collect_prometheus_metrics():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=MIN_COLLECT_INTERVAL_SECONDS)
+    async with get_session_ctx() as session:
+        res = await session.execute(
+            select(JobModel)
+            .join(JobPrometheusMetrics, isouter=True)
+            .where(
+                JobModel.status.in_([JobStatus.RUNNING]),
+                or_(
+                    JobPrometheusMetrics.job_id.is_(None),
+                    JobPrometheusMetrics.collected_at < cutoff,
+                ),
+            )
+            .options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
+            .order_by(JobModel.last_processed_at.asc())
+            .limit(MAX_JOBS_FETCHED)
+        )
+        job_models = res.unique().scalars().all()
+    for batch in batched(job_models, BATCH_SIZE):
+        await _collect_jobs_metrics(batch, now)
+
+
+async def delete_prometheus_metrics():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=METRICS_TTL_SECONDS)
+    async with get_session_ctx() as session:
+        await session.execute(
+            delete(JobPrometheusMetrics).where(JobPrometheusMetrics.collected_at < cutoff)
+        )
+        await session.commit()
+
+
+async def _collect_jobs_metrics(job_models: list[JobModel], collected_at: datetime):
+    results = await gather_map_async(job_models, _collect_job_metrics, return_exceptions=True)
+    async with get_session_ctx() as session:
+        for job_model, result in results:
+            if result is None:
+                continue
+            if isinstance(result, BaseException):
+                logger.error(
+                    "Failed to collect job %s Prometheus metrics: %r", job_model.job_name, result
+                )
+                continue
+            res = await session.execute(
+                update(JobPrometheusMetrics)
+                .where(JobPrometheusMetrics.job_id == job_model.id)
+                .values(
+                    collected_at=collected_at,
+                    text=result,
+                )
+                .returning(JobPrometheusMetrics)
+            )
+            metrics = res.scalar()
+            if metrics is None:
+                metrics = JobPrometheusMetrics(
+                    job_id=job_model.id,
+                    collected_at=collected_at,
+                    text=result,
+                )
+                try:
+                    async with session.begin_nested():
+                        session.add(metrics)
+                except sqlalchemy.exc.IntegrityError:
+                    # Concurrent server replica already committed, ignoring
+                    pass
+        await session.commit()
+
+
+async def _collect_job_metrics(job_model: JobModel) -> Optional[str]:
+    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
+    jpd = get_job_provisioning_data(job_model)
+    jrd = get_job_runtime_data(job_model)
+    if jpd is None:
+        return None
+    try:
+        res = await run_async(
+            _pull_job_metrics,
+            ssh_private_keys,
+            jpd,
+            jrd,
+            job_model.id,
+        )
+    except Exception:
+        logger.exception("Failed to collect job %s Prometheus metrics", job_model.job_name)
+        return None
+
+    if isinstance(res, bool):
+        logger.warning(
+            "Failed to connect to job %s to collect Prometheus metrics", job_model.job_name
+        )
+        return None
+
+    if res is None:
+        # Either not supported by shim or exporter is not available
+        return None
+
+    return res
+
+
+@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
+def _pull_job_metrics(ports: dict[int, int], task_id: uuid.UUID) -> Optional[str]:
+    shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
+    return shim_client.get_task_metrics(task_id)
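
Each collected sample is the raw Prometheus exposition text scraped from dstack-shim and stored as-is in `JobPrometheusMetrics.text`. A small sketch of what such text looks like and how it can be parsed with the standard `prometheus_client` library; the metric name and labels here are illustrative, not what the shim actually exports:

```python
from prometheus_client.parser import text_string_to_metric_families

# Illustrative exposition text; real metric names and labels come from the shim's exporter.
sample_text = """\
# HELP example_gpu_utilization_percent GPU utilization per device
# TYPE example_gpu_utilization_percent gauge
example_gpu_utilization_percent{gpu="0"} 87.0
example_gpu_utilization_percent{gpu="1"} 12.0
"""

for family in text_string_to_metric_families(sample_text):
    for sample in family.samples:
        print(sample.name, sample.labels, sample.value)
```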

dstack/_internal/server/background/tasks/process_running_jobs.py
CHANGED

@@ -1,4 +1,6 @@
 import asyncio
+from collections.abc import Iterable
+from datetime import timedelta
 from typing import Dict, List, Optional

 from sqlalchemy import select
@@ -15,6 +17,7 @@ from dstack._internal.core.models.instances import (
     RemoteConnectionInfo,
     SSHConnectionParams,
 )
+from dstack._internal.core.models.metrics import Metric
 from dstack._internal.core.models.repos import RemoteRepoCreds
 from dstack._internal.core.models.runs import (
     ClusterInfo,
@@ -48,6 +51,7 @@ from dstack._internal.server.services.jobs import (
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
+from dstack._internal.server.services.metrics import get_job_metrics
 from dstack._internal.server.services.pools import get_instance_ssh_private_keys
 from dstack._internal.server.services.repos import (
     get_code_model,
@@ -343,6 +347,9 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             job_model.status = JobStatus.TERMINATING
             job_model.termination_reason = JobTerminationReason.GATEWAY_ERROR

+    if job_model.status == JobStatus.RUNNING:
+        await _check_gpu_utilization(session, job_model, job)
+
     job_model.last_processed_at = common_utils.get_current_datetime()
     await session.commit()

@@ -646,27 +653,67 @@ def _terminate_if_inactivity_duration_exceeded(
     run_model: RunModel, job_model: JobModel, no_connections_secs: Optional[int]
 ) -> None:
     conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration
-    if is_core_model_instance(conf, DevEnvironmentConfiguration)
+    if not is_core_model_instance(conf, DevEnvironmentConfiguration) or not isinstance(
         conf.inactivity_duration, int
     ):
-
-        job_model.inactivity_secs =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # reset in case inactivity_duration was disabled via in-place update
+        job_model.inactivity_secs = None
+        return
+    logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs)
+    job_model.inactivity_secs = no_connections_secs
+    if no_connections_secs is None:
+        # TODO(0.19 or earlier): make no_connections_secs required
+        job_model.status = JobStatus.TERMINATING
+        job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+        job_model.termination_reason_message = (
+            "The selected instance was created before dstack 0.18.41"
+            " and does not support inactivity_duration"
+        )
+    elif no_connections_secs >= conf.inactivity_duration:
+        job_model.status = JobStatus.TERMINATING
+        # TODO(0.19 or earlier): set JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = (
+            f"The job was inactive for {no_connections_secs} seconds,"
+            f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds"
+        )
+
+
+async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
+    policy = job.job_spec.utilization_policy
+    if policy is None:
+        return
+    after = common_utils.get_current_datetime() - timedelta(seconds=policy.time_window)
+    job_metrics = await get_job_metrics(session, job_model, after=after)
+    gpus_util_metrics: list[Metric] = []
+    for metric in job_metrics.metrics:
+        if metric.name.startswith("gpu_util_percent_gpu"):
+            gpus_util_metrics.append(metric)
+    if not gpus_util_metrics or gpus_util_metrics[0].timestamps[-1] > after + timedelta(minutes=1):
+        # Job has started recently, not enough points collected.
+        # Assuming that metrics collection interval less than 1 minute.
+        logger.debug("%s: GPU utilization check: not enough samples", fmt(job_model))
+        return
+    if _should_terminate_due_to_low_gpu_util(
+        policy.min_gpu_utilization, [m.values for m in gpus_util_metrics]
+    ):
+        logger.info("%s: GPU utilization check: terminating", fmt(job_model))
+        job_model.status = JobStatus.TERMINATING
+        # TODO(0.19 or earlier): set JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = (
+            f"The job GPU utilization below {policy.min_gpu_utilization}%"
+            f" for {policy.time_window} seconds"
+        )
+    else:
+        logger.debug("%s: GPU utilization check: OK", fmt(job_model))
+
+
+def _should_terminate_due_to_low_gpu_util(min_util: int, gpus_util: Iterable[Iterable[int]]):
+    for gpu_util in gpus_util:
+        if all(util < min_util for util in gpu_util):
+            return True
+    return False


 def _get_cluster_info(
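
The predicate in `_should_terminate_due_to_low_gpu_util` fires only when at least one GPU stays below the threshold for every sample in the window; a single sample at or above the threshold keeps the run alive. A standalone sketch of the same check with sample data:

```python
from collections.abc import Iterable


def should_terminate(min_util: int, gpus_util: Iterable[Iterable[int]]) -> bool:
    # Same semantics as the helper above: any single GPU whose every sample
    # in the window is below min_util triggers termination.
    for gpu_util in gpus_util:
        if all(util < min_util for util in gpu_util):
            return True
    return False


# GPU 0 idles for the whole window while GPU 1 is busy -> terminate.
assert should_terminate(10, [[0, 1, 2], [95, 90, 88]]) is True
# Every GPU reaches the threshold at least once -> keep running.
assert should_terminate(10, [[0, 50, 2], [95, 90, 88]]) is False
```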

dstack/_internal/server/background/tasks/process_submitted_jobs.py
CHANGED

@@ -35,6 +35,7 @@ from dstack._internal.core.models.runs import (
 )
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.core.services.profiles import get_termination
+from dstack._internal.server import settings
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -195,6 +196,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             InstanceModel.total_blocks > InstanceModel.busy_blocks,
         )
         .options(lazyload(InstanceModel.jobs))
+        .order_by(InstanceModel.id)  # take locks in order
         .with_for_update()
     )
     pool_instances = list(res.unique().scalars().all())
@@ -319,6 +321,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         select(VolumeModel)
         .where(VolumeModel.id.in_(volumes_ids))
         .options(selectinload(VolumeModel.user))
+        .order_by(VolumeModel.id)  # take locks in order
         .with_for_update()
     )
     async with get_locker().lock_ctx(VolumeModel.__tablename__, volumes_ids):
@@ -450,7 +453,7 @@ async def _run_job_on_new_instance(
     )
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
-    for backend, offer in offers[:
+    for backend, offer in offers[: settings.MAX_OFFERS_TRIED]:
         logger.debug(
             "%s: trying %s in %s/%s for $%0.4f per hour",
             fmt(job_model),