dstack 0.18.42__py3-none-any.whl → 0.18.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +2 -1
- dstack/_internal/cli/commands/apply.py +4 -2
- dstack/_internal/cli/commands/attach.py +21 -1
- dstack/_internal/cli/commands/completion.py +20 -0
- dstack/_internal/cli/commands/delete.py +3 -1
- dstack/_internal/cli/commands/fleet.py +2 -1
- dstack/_internal/cli/commands/gateway.py +7 -2
- dstack/_internal/cli/commands/logs.py +3 -2
- dstack/_internal/cli/commands/stats.py +2 -1
- dstack/_internal/cli/commands/stop.py +2 -1
- dstack/_internal/cli/commands/volume.py +2 -1
- dstack/_internal/cli/main.py +6 -0
- dstack/_internal/cli/services/completion.py +86 -0
- dstack/_internal/cli/services/configurators/run.py +11 -17
- dstack/_internal/cli/utils/fleet.py +5 -1
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/aws/compute.py +23 -10
- dstack/_internal/core/backends/aws/resources.py +3 -3
- dstack/_internal/core/backends/azure/compute.py +15 -9
- dstack/_internal/core/backends/azure/resources.py +2 -0
- dstack/_internal/core/backends/base/compute.py +102 -2
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/cudo/compute.py +8 -4
- dstack/_internal/core/backends/datacrunch/compute.py +10 -4
- dstack/_internal/core/backends/gcp/auth.py +19 -13
- dstack/_internal/core/backends/gcp/compute.py +26 -20
- dstack/_internal/core/backends/gcp/resources.py +3 -10
- dstack/_internal/core/backends/kubernetes/compute.py +4 -3
- dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
- dstack/_internal/core/backends/nebius/compute.py +2 -2
- dstack/_internal/core/backends/oci/compute.py +10 -4
- dstack/_internal/core/backends/runpod/compute.py +32 -7
- dstack/_internal/core/backends/runpod/config.py +8 -0
- dstack/_internal/core/backends/tensordock/compute.py +14 -3
- dstack/_internal/core/backends/vastai/compute.py +12 -2
- dstack/_internal/core/backends/vultr/api_client.py +3 -3
- dstack/_internal/core/backends/vultr/compute.py +9 -3
- dstack/_internal/core/models/backends/aws.py +2 -0
- dstack/_internal/core/models/backends/base.py +1 -0
- dstack/_internal/core/models/backends/runpod.py +2 -0
- dstack/_internal/core/models/configurations.py +2 -2
- dstack/_internal/core/models/profiles.py +46 -1
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/core/services/__init__.py +5 -1
- dstack/_internal/core/services/configs/__init__.py +3 -0
- dstack/_internal/server/app.py +11 -1
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/common.py +22 -0
- dstack/_internal/server/background/tasks/process_instances.py +11 -18
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +74 -34
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/models.py +11 -0
- dstack/_internal/server/routers/logs.py +3 -0
- dstack/_internal/server/routers/metrics.py +21 -2
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/configurators/aws.py +31 -1
- dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
- dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
- dstack/_internal/server/services/config.py +24 -4
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/gateways/__init__.py +1 -0
- dstack/_internal/server/services/jobs/__init__.py +12 -9
- dstack/_internal/server/services/jobs/configurators/base.py +9 -1
- dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +1 -3
- dstack/_internal/server/services/logs/__init__.py +78 -0
- dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +110 -0
- dstack/_internal/server/services/logs/gcp.py +165 -0
- dstack/_internal/server/services/metrics.py +103 -70
- dstack/_internal/server/services/pools.py +16 -17
- dstack/_internal/server/services/prometheus.py +87 -0
- dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +43 -15
- dstack/_internal/server/services/volumes.py +1 -0
- dstack/_internal/server/settings.py +6 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-4eb116b97819badd1e2c.js} +131 -78
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
- dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +50 -8
- dstack/api/_public/runs.py +4 -1
- dstack/api/server/_fleets.py +2 -0
- dstack/api/server/_runs.py +4 -0
- dstack/api/utils.py +3 -0
- dstack/version.py +2 -2
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/METADATA +13 -3
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/RECORD +115 -97
- tests/_internal/core/backends/base/__init__.py +0 -0
- tests/_internal/core/backends/base/test_compute.py +56 -0
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +126 -1
- tests/_internal/server/conftest.py +4 -5
- tests/_internal/server/routers/test_backends.py +1 -0
- tests/_internal/server/routers/test_fleets.py +2 -0
- tests/_internal/server/routers/test_logs.py +1 -1
- tests/_internal/server/routers/test_metrics.py +15 -0
- tests/_internal/server/routers/test_prometheus.py +244 -0
- tests/_internal/server/routers/test_runs.py +81 -58
- tests/_internal/server/services/test_logs.py +3 -3
- tests/_internal/server/services/test_metrics.py +163 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
dstack/_internal/server/background/tasks/process_instances.py

@@ -45,7 +45,6 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     InstanceRuntime,
     InstanceStatus,
-    InstanceType,
     RemoteConnectionInfo,
     SSHKey,
 )
@@ -63,6 +62,7 @@ from dstack._internal.core.models.runs import (
     Retry,
 )
 from dstack._internal.core.services.profiles import get_retry
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -695,7 +695,8 @@ async def _check_instance(instance: InstanceModel) -> None:

     if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None:
         provisioning_deadline = _get_provisioning_deadline(
-            instance,
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
         )
         if get_current_datetime() > provisioning_deadline:
             instance.status = InstanceStatus.TERMINATING
@@ -737,7 +738,8 @@ async def _wait_for_instance_provisioning_data(
             instance.name,
         )
         provisioning_deadline = _get_provisioning_deadline(
-            instance,
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
         )
         if get_current_datetime() > provisioning_deadline:
             logger.warning(
@@ -959,24 +961,15 @@ def _get_retry_duration_deadline(instance: InstanceModel, retry: Retry) -> datetime.datetime:


 def _get_provisioning_deadline(
-    instance: InstanceModel,
+    instance: InstanceModel,
+    job_provisioning_data: JobProvisioningData,
 ) -> datetime.datetime:
-    timeout_interval =
+    timeout_interval = get_provisioning_timeout(
+        backend_type=job_provisioning_data.get_base_backend(),
+        instance_type_name=job_provisioning_data.instance_type.name,
+    )
     return instance.started_at.replace(tzinfo=datetime.timezone.utc) + timeout_interval


-def _get_instance_timeout_interval(
-    backend_type: BackendType, instance_type_name: str
-) -> timedelta:
-    # when changing timeouts, also consider process_running_jobs._get_runner_timeout_interval
-    if backend_type == BackendType.RUNPOD:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)
-
-
 def _ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]:
     return [pkey_from_str(sk.private) for sk in ssh_keys if sk.private is not None]

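Both process_instances.py (above) and process_running_jobs.py (below) now delegate their per-backend provisioning timeouts to get_provisioning_timeout from the new background/tasks/common.py (+22 lines in the file list, not shown in this view). A minimal sketch of such a consolidated helper, assuming it simply merges the timeout tables removed from the two task modules:

from datetime import timedelta

from dstack._internal.core.models.backends.base import BackendType


def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
    # Sketch only: merges the per-backend timeouts previously hardcoded in
    # _get_instance_timeout_interval and _get_runner_timeout_interval.
    if backend_type in (BackendType.RUNPOD, BackendType.LAMBDA, BackendType.KUBERNETES):
        return timedelta(seconds=1200)
    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
        return timedelta(seconds=1200)
    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
        return timedelta(seconds=3300)
    return timedelta(seconds=600)
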
dstack/_internal/server/background/tasks/process_placement_groups.py

@@ -28,6 +28,7 @@ async def process_placement_groups():
             PlacementGroupModel.deleted == False,
             PlacementGroupModel.id.not_in(lockset),
         )
+        .order_by(PlacementGroupModel.id)  # take locks in order
         .with_for_update(skip_locked=True)
     )
     placement_group_models = res.scalars().all()

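The .order_by(...) added before .with_for_update(...) here (and again in process_submitted_jobs.py below) makes concurrent workers acquire row locks in a consistent order, which avoids lock-ordering deadlocks between server replicas. An illustrative sketch of the pattern with a hypothetical model, not code from the package:

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession


async def lock_rows(session: AsyncSession, model, ids):
    # Ordering by primary key before FOR UPDATE makes lock acquisition deterministic,
    # so two workers selecting overlapping id sets cannot deadlock on each other.
    res = await session.execute(
        select(model)
        .where(model.id.in_(ids))
        .order_by(model.id)  # take locks in order
        .with_for_update(skip_locked=True)
    )
    return res.scalars().all()
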
dstack/_internal/server/background/tasks/process_prometheus_metrics.py
ADDED

@@ -0,0 +1,135 @@
+import uuid
+from datetime import datetime, timedelta
+from typing import Optional
+
+import sqlalchemy.exc
+from sqlalchemy import delete, or_, select, update
+from sqlalchemy.orm import joinedload
+
+from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
+from dstack._internal.core.models.runs import JobStatus
+from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.models import InstanceModel, JobModel, JobPrometheusMetrics
+from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
+from dstack._internal.server.services.pools import get_instance_ssh_private_keys
+from dstack._internal.server.services.runner import client
+from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
+from dstack._internal.server.utils.common import gather_map_async
+from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+MAX_JOBS_FETCHED = 100
+BATCH_SIZE = 10
+MIN_COLLECT_INTERVAL_SECONDS = 9
+# 10 minutes should be more than enough to scrape metrics, and, in any case,
+# 10 minutes old metrics has little to no value
+METRICS_TTL_SECONDS = 600
+
+
+async def collect_prometheus_metrics():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=MIN_COLLECT_INTERVAL_SECONDS)
+    async with get_session_ctx() as session:
+        res = await session.execute(
+            select(JobModel)
+            .join(JobPrometheusMetrics, isouter=True)
+            .where(
+                JobModel.status.in_([JobStatus.RUNNING]),
+                or_(
+                    JobPrometheusMetrics.job_id.is_(None),
+                    JobPrometheusMetrics.collected_at < cutoff,
+                ),
+            )
+            .options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
+            .order_by(JobModel.last_processed_at.asc())
+            .limit(MAX_JOBS_FETCHED)
+        )
+        job_models = res.unique().scalars().all()
+    for batch in batched(job_models, BATCH_SIZE):
+        await _collect_jobs_metrics(batch, now)
+
+
+async def delete_prometheus_metrics():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=METRICS_TTL_SECONDS)
+    async with get_session_ctx() as session:
+        await session.execute(
+            delete(JobPrometheusMetrics).where(JobPrometheusMetrics.collected_at < cutoff)
+        )
+        await session.commit()
+
+
+async def _collect_jobs_metrics(job_models: list[JobModel], collected_at: datetime):
+    results = await gather_map_async(job_models, _collect_job_metrics, return_exceptions=True)
+    async with get_session_ctx() as session:
+        for job_model, result in results:
+            if result is None:
+                continue
+            if isinstance(result, BaseException):
+                logger.error(
+                    "Failed to collect job %s Prometheus metrics: %r", job_model.job_name, result
+                )
+                continue
+            res = await session.execute(
+                update(JobPrometheusMetrics)
+                .where(JobPrometheusMetrics.job_id == job_model.id)
+                .values(
+                    collected_at=collected_at,
+                    text=result,
+                )
+                .returning(JobPrometheusMetrics)
+            )
+            metrics = res.scalar()
+            if metrics is None:
+                metrics = JobPrometheusMetrics(
+                    job_id=job_model.id,
+                    collected_at=collected_at,
+                    text=result,
+                )
+                try:
+                    async with session.begin_nested():
+                        session.add(metrics)
+                except sqlalchemy.exc.IntegrityError:
+                    # Concurrent server replica already committed, ignoring
+                    pass
+        await session.commit()
+
+
+async def _collect_job_metrics(job_model: JobModel) -> Optional[str]:
+    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
+    jpd = get_job_provisioning_data(job_model)
+    jrd = get_job_runtime_data(job_model)
+    if jpd is None:
+        return None
+    try:
+        res = await run_async(
+            _pull_job_metrics,
+            ssh_private_keys,
+            jpd,
+            jrd,
+            job_model.id,
+        )
+    except Exception:
+        logger.exception("Failed to collect job %s Prometheus metrics", job_model.job_name)
+        return None
+
+    if isinstance(res, bool):
+        logger.warning(
+            "Failed to connect to job %s to collect Prometheus metrics", job_model.job_name
+        )
+        return None
+
+    if res is None:
+        # Either not supported by shim or exporter is not available
+        return None
+
+    return res
+
+
+@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
+def _pull_job_metrics(ports: dict[int, int], task_id: uuid.UUID) -> Optional[str]:
+    shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
+    return shim_client.get_task_metrics(task_id)

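The raw Prometheus exposition-format text stored by this task is later served through the new Prometheus endpoint (routers/prometheus.py and services/prometheus.py in the file list above, whose diffs are not included in this view). A minimal sketch, with assumed function names, of reading the stored payloads back for a scrape response:

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from dstack._internal.server.models import JobPrometheusMetrics


async def get_all_jobs_metrics_text(session: AsyncSession) -> str:
    # Sketch only: each row already holds a complete Prometheus exposition-format
    # payload scraped from the job's shim, so serving is a simple concatenation.
    res = await session.execute(select(JobPrometheusMetrics))
    return "\n".join(m.text for m in res.scalars().all())
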
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -1,4 +1,5 @@
 import asyncio
+from collections.abc import Iterable
 from datetime import timedelta
 from typing import Dict, List, Optional

@@ -16,11 +17,13 @@ from dstack._internal.core.models.instances import (
     RemoteConnectionInfo,
     SSHConnectionParams,
 )
+from dstack._internal.core.models.metrics import Metric
 from dstack._internal.core.models.repos import RemoteRepoCreds
 from dstack._internal.core.models.runs import (
     ClusterInfo,
     Job,
     JobProvisioningData,
+    JobRuntimeData,
     JobSpec,
     JobStatus,
     JobTerminationReason,
@@ -28,6 +31,7 @@ from dstack._internal.core.models.runs import (
     RunSpec,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
@@ -47,6 +51,7 @@ from dstack._internal.server.services.jobs import (
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
+from dstack._internal.server.services.metrics import get_job_metrics
 from dstack._internal.server.services.pools import get_instance_ssh_private_keys
 from dstack._internal.server.services.repos import (
     get_code_model,
@@ -148,6 +153,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             jobs=run.jobs,
             replica_num=job.job_spec.replica_num,
             job_provisioning_data=job_provisioning_data,
+            job_runtime_data=job_submission.job_runtime_data,
         )

         volumes = await get_job_attached_volumes(
@@ -242,7 +248,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):

         if not success:
             # check timeout
-            if job_submission.age >
+            if job_submission.age > get_provisioning_timeout(
                 backend_type=job_provisioning_data.get_base_backend(),
                 instance_type_name=job_provisioning_data.instance_type.name,
             ):
@@ -341,6 +347,9 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             job_model.status = JobStatus.TERMINATING
             job_model.termination_reason = JobTerminationReason.GATEWAY_ERROR

+    if job_model.status == JobStatus.RUNNING:
+        await _check_gpu_utilization(session, job_model, job)
+
     job_model.last_processed_at = common_utils.get_current_datetime()
     await session.commit()

@@ -644,33 +653,74 @@ def _terminate_if_inactivity_duration_exceeded(
     run_model: RunModel, job_model: JobModel, no_connections_secs: Optional[int]
 ) -> None:
     conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration
-    if is_core_model_instance(conf, DevEnvironmentConfiguration)
+    if not is_core_model_instance(conf, DevEnvironmentConfiguration) or not isinstance(
         conf.inactivity_duration, int
     ):
-
-        job_model.inactivity_secs =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # reset in case inactivity_duration was disabled via in-place update
+        job_model.inactivity_secs = None
+        return
+    logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs)
+    job_model.inactivity_secs = no_connections_secs
+    if no_connections_secs is None:
+        # TODO(0.19 or earlier): make no_connections_secs required
+        job_model.status = JobStatus.TERMINATING
+        job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+        job_model.termination_reason_message = (
+            "The selected instance was created before dstack 0.18.41"
+            " and does not support inactivity_duration"
+        )
+    elif no_connections_secs >= conf.inactivity_duration:
+        job_model.status = JobStatus.TERMINATING
+        # TODO(0.19 or earlier): set JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = (
+            f"The job was inactive for {no_connections_secs} seconds,"
+            f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds"
+        )
+
+
+async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
+    policy = job.job_spec.utilization_policy
+    if policy is None:
+        return
+    after = common_utils.get_current_datetime() - timedelta(seconds=policy.time_window)
+    job_metrics = await get_job_metrics(session, job_model, after=after)
+    gpus_util_metrics: list[Metric] = []
+    for metric in job_metrics.metrics:
+        if metric.name.startswith("gpu_util_percent_gpu"):
+            gpus_util_metrics.append(metric)
+    if not gpus_util_metrics or gpus_util_metrics[0].timestamps[-1] > after + timedelta(minutes=1):
+        # Job has started recently, not enough points collected.
+        # Assuming that metrics collection interval less than 1 minute.
+        logger.debug("%s: GPU utilization check: not enough samples", fmt(job_model))
+        return
+    if _should_terminate_due_to_low_gpu_util(
+        policy.min_gpu_utilization, [m.values for m in gpus_util_metrics]
+    ):
+        logger.info("%s: GPU utilization check: terminating", fmt(job_model))
+        job_model.status = JobStatus.TERMINATING
+        # TODO(0.19 or earlier): set JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = (
+            f"The job GPU utilization below {policy.min_gpu_utilization}%"
+            f" for {policy.time_window} seconds"
+        )
+    else:
+        logger.debug("%s: GPU utilization check: OK", fmt(job_model))
+
+
+def _should_terminate_due_to_low_gpu_util(min_util: int, gpus_util: Iterable[Iterable[int]]):
+    for gpu_util in gpus_util:
+        if all(util < min_util for util in gpu_util):
+            return True
+    return False


 def _get_cluster_info(
     jobs: List[Job],
     replica_num: int,
     job_provisioning_data: JobProvisioningData,
+    job_runtime_data: Optional[JobRuntimeData],
 ) -> ClusterInfo:
     job_ips = []
     for job in jobs:
@@ -681,10 +731,13 @@ def _get_cluster_info(
             ).internal_ip
             or ""
         )
+    gpus_per_job = len(job_provisioning_data.instance_type.resources.gpus)
+    if job_runtime_data is not None and job_runtime_data.offer is not None:
+        gpus_per_job = len(job_runtime_data.offer.instance.resources.gpus)
     cluster_info = ClusterInfo(
         job_ips=job_ips,
         master_job_ip=job_ips[0],
-        gpus_per_job=
+        gpus_per_job=gpus_per_job,
     )
     return cluster_info

@@ -763,16 +816,3 @@ def _submit_job_to_runner(
     # do not log here, because the runner will send a new status

     return True
-
-
-def _get_runner_timeout_interval(backend_type: BackendType, instance_type_name: str) -> timedelta:
-    # when changing timeouts, also consider process_instances._get_instance_timeout_interval
-    if backend_type == BackendType.LAMBDA:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.KUBERNETES:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)

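For clarity, _should_terminate_due_to_low_gpu_util as added above returns True as soon as any single GPU stayed below min_gpu_utilization for every sample in the time window; one busy sample per GPU is enough to keep the job. A small usage example with made-up values, evaluated in the context of process_running_jobs.py:

samples = [
    [5, 3, 7],     # GPU 0 never exceeded 10% over the window
    [80, 95, 60],  # GPU 1 was busy
]
assert _should_terminate_due_to_low_gpu_util(10, samples)            # GPU 0 triggers termination
assert not _should_terminate_due_to_low_gpu_util(10, [[80, 5, 60]])  # one busy sample keeps the job
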
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -35,6 +35,7 @@ from dstack._internal.core.models.runs import (
 )
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.core.services.profiles import get_termination
+from dstack._internal.server import settings
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -195,6 +196,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
                 InstanceModel.total_blocks > InstanceModel.busy_blocks,
             )
             .options(lazyload(InstanceModel.jobs))
+            .order_by(InstanceModel.id)  # take locks in order
             .with_for_update()
         )
         pool_instances = list(res.unique().scalars().all())
@@ -319,6 +321,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             select(VolumeModel)
             .where(VolumeModel.id.in_(volumes_ids))
             .options(selectinload(VolumeModel.user))
+            .order_by(VolumeModel.id)  # take locks in order
             .with_for_update()
         )
         async with get_locker().lock_ctx(VolumeModel.__tablename__, volumes_ids):
@@ -450,7 +453,7 @@ async def _run_job_on_new_instance(
     )
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
-    for backend, offer in offers[:
+    for backend, offer in offers[: settings.MAX_OFFERS_TRIED]:
         logger.debug(
             "%s: trying %s in %s/%s for $%0.4f per hour",
             fmt(job_model),

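The loop above now caps the number of offers tried via the new settings.MAX_OFFERS_TRIED (settings.py gains 6 lines in the file list, not shown in this view). A hypothetical sketch of how such a server setting is typically wired up; the environment variable name and default below are assumptions, not the actual dstack code:

import os

# Hypothetical: the real definition lives in dstack/_internal/server/settings.py,
# whose diff is not included here.
MAX_OFFERS_TRIED = int(os.getenv("DSTACK_MAX_OFFERS_TRIED", 25))
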
|
@@ -11,7 +11,6 @@ from dstack._internal.server.models import (
|
|
|
11
11
|
JobModel,
|
|
12
12
|
ProjectModel,
|
|
13
13
|
VolumeAttachmentModel,
|
|
14
|
-
VolumeModel,
|
|
15
14
|
)
|
|
16
15
|
from dstack._internal.server.services.jobs import (
|
|
17
16
|
process_terminating_job,
|
|
@@ -86,12 +85,7 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
|
|
|
86
85
|
.where(InstanceModel.id == job_model.used_instance_id)
|
|
87
86
|
.options(
|
|
88
87
|
joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
|
|
89
|
-
joinedload(InstanceModel.volume_attachments)
|
|
90
|
-
.joinedload(VolumeAttachmentModel.volume)
|
|
91
|
-
.joinedload(VolumeModel.user),
|
|
92
|
-
joinedload(InstanceModel.volume_attachments)
|
|
93
|
-
.joinedload(VolumeAttachmentModel.volume)
|
|
94
|
-
.joinedload(VolumeModel.attachments),
|
|
88
|
+
joinedload(InstanceModel.volume_attachments).joinedload(VolumeAttachmentModel.volume),
|
|
95
89
|
)
|
|
96
90
|
)
|
|
97
91
|
instance_model = res.unique().scalar()
|
|
dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py
ADDED

@@ -0,0 +1,40 @@
+"""Add JobPrometheusMetrics
+
+Revision ID: 60e444118b6d
+Revises: a751ef183f27
+Create Date: 2025-02-21 10:59:26.339353
+
+"""
+
+import sqlalchemy as sa
+import sqlalchemy_utils
+from alembic import op
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "60e444118b6d"
+down_revision = "a751ef183f27"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "job_prometheus_metrics",
+        sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+        sa.Column("collected_at", dstack._internal.server.models.NaiveDateTime(), nullable=False),
+        sa.Column("text", sa.Text(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["job_id"], ["jobs.id"], name=op.f("fk_job_prometheus_metrics_job_id_jobs")
+        ),
+        sa.PrimaryKeyConstraint("job_id", name=op.f("pk_job_prometheus_metrics")),
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table("job_prometheus_metrics")
+    # ### end Alembic commands ###

dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py
ADDED

@@ -0,0 +1,140 @@
+"""Add JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
+
+Revision ID: 98d1b92988bc
+Revises: 60e444118b6d
+Create Date: 2025-02-28 15:12:37.649876
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+from alembic_postgresql_enum import TableReference
+
+# revision identifiers, used by Alembic.
+revision = "98d1b92988bc"
+down_revision = "60e444118b6d"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # SQLite
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.alter_column(
+            "termination_reason",
+            existing_type=sa.VARCHAR(length=34),
+            type_=sa.Enum(
+                "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+                "INTERRUPTED_BY_NO_CAPACITY",
+                "WAITING_INSTANCE_LIMIT_EXCEEDED",
+                "WAITING_RUNNER_LIMIT_EXCEEDED",
+                "TERMINATED_BY_USER",
+                "VOLUME_ERROR",
+                "GATEWAY_ERROR",
+                "SCALED_DOWN",
+                "DONE_BY_RUNNER",
+                "ABORTED_BY_USER",
+                "TERMINATED_BY_SERVER",
+                "INACTIVITY_DURATION_EXCEEDED",
+                "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+                "CONTAINER_EXITED_WITH_ERROR",
+                "PORTS_BINDING_FAILED",
+                "CREATING_CONTAINER_ERROR",
+                "EXECUTOR_ERROR",
+                "MAX_DURATION_EXCEEDED",
+                name="jobterminationreason",
+            ),
+            existing_nullable=True,
+        )
+    # PostgreSQL
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+
+
+def downgrade() -> None:
+    # SQLite
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.alter_column(
+            "termination_reason",
+            existing_type=sa.Enum(
+                "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+                "INTERRUPTED_BY_NO_CAPACITY",
+                "WAITING_INSTANCE_LIMIT_EXCEEDED",
+                "WAITING_RUNNER_LIMIT_EXCEEDED",
+                "TERMINATED_BY_USER",
+                "VOLUME_ERROR",
+                "GATEWAY_ERROR",
+                "SCALED_DOWN",
+                "DONE_BY_RUNNER",
+                "ABORTED_BY_USER",
+                "TERMINATED_BY_SERVER",
+                "INACTIVITY_DURATION_EXCEEDED",
+                "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+                "CONTAINER_EXITED_WITH_ERROR",
+                "PORTS_BINDING_FAILED",
+                "CREATING_CONTAINER_ERROR",
+                "EXECUTOR_ERROR",
+                "MAX_DURATION_EXCEEDED",
+                name="jobterminationreason",
+            ),
+            type_=sa.VARCHAR(length=34),
+            existing_nullable=True,
+        )
+    # PostgreSQL
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )

dstack/_internal/server/models.py

@@ -648,3 +648,14 @@ class JobMetricsPoint(BaseModel):
     # json-encoded lists of metric values of len(gpus) length
     gpus_memory_usage_bytes: Mapped[str] = mapped_column(Text)
     gpus_util_percent: Mapped[str] = mapped_column(Text)
+
+
+class JobPrometheusMetrics(BaseModel):
+    __tablename__ = "job_prometheus_metrics"
+
+    job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), primary_key=True)
+    job: Mapped["JobModel"] = relationship()
+
+    collected_at: Mapped[datetime] = mapped_column(NaiveDateTime)
+    # Raw Prometheus text response
+    text: Mapped[str] = mapped_column(Text)

dstack/_internal/server/routers/logs.py

@@ -24,4 +24,7 @@ async def poll_logs(
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ) -> JobSubmissionLogs:
     _, project = user_project
+    # The runner guarantees logs have different timestamps if throughput < 1k logs / sec.
+    # Otherwise, some logs with duplicated timestamps may be filtered out.
+    # This limitation is imposed by cloud log services that support up to millisecond timestamp resolution.
     return await logs.poll_logs_async(project=project, request=body)