dstack 0.18.43__py3-none-any.whl → 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/gateway.py +15 -3
- dstack/_internal/cli/commands/logs.py +0 -22
- dstack/_internal/cli/commands/stats.py +8 -17
- dstack/_internal/cli/main.py +1 -5
- dstack/_internal/cli/services/configurators/fleet.py +4 -39
- dstack/_internal/cli/services/configurators/run.py +22 -20
- dstack/_internal/cli/services/profile.py +34 -83
- dstack/_internal/cli/utils/gateway.py +1 -1
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/__init__.py +56 -39
- dstack/_internal/core/backends/aws/__init__.py +0 -25
- dstack/_internal/core/backends/aws/auth.py +1 -10
- dstack/_internal/core/backends/aws/backend.py +26 -0
- dstack/_internal/core/backends/aws/compute.py +21 -45
- dstack/_internal/{server/services/backends/configurators/aws.py → core/backends/aws/configurator.py} +46 -85
- dstack/_internal/core/backends/aws/models.py +135 -0
- dstack/_internal/core/backends/aws/resources.py +1 -1
- dstack/_internal/core/backends/azure/__init__.py +0 -20
- dstack/_internal/core/backends/azure/auth.py +2 -11
- dstack/_internal/core/backends/azure/backend.py +21 -0
- dstack/_internal/core/backends/azure/compute.py +14 -28
- dstack/_internal/{server/services/backends/configurators/azure.py → core/backends/azure/configurator.py} +141 -210
- dstack/_internal/core/backends/azure/models.py +89 -0
- dstack/_internal/core/backends/base/__init__.py +0 -12
- dstack/_internal/core/backends/base/backend.py +18 -0
- dstack/_internal/core/backends/base/compute.py +153 -33
- dstack/_internal/core/backends/base/configurator.py +105 -0
- dstack/_internal/core/backends/base/models.py +14 -0
- dstack/_internal/core/backends/configurators.py +138 -0
- dstack/_internal/core/backends/cudo/__init__.py +0 -15
- dstack/_internal/core/backends/cudo/backend.py +16 -0
- dstack/_internal/core/backends/cudo/compute.py +8 -26
- dstack/_internal/core/backends/cudo/configurator.py +72 -0
- dstack/_internal/core/backends/cudo/models.py +37 -0
- dstack/_internal/core/backends/datacrunch/__init__.py +0 -15
- dstack/_internal/core/backends/datacrunch/backend.py +16 -0
- dstack/_internal/core/backends/datacrunch/compute.py +8 -25
- dstack/_internal/core/backends/datacrunch/configurator.py +66 -0
- dstack/_internal/core/backends/datacrunch/models.py +38 -0
- dstack/_internal/core/{models/backends/dstack.py → backends/dstack/models.py} +7 -7
- dstack/_internal/core/backends/gcp/__init__.py +0 -16
- dstack/_internal/core/backends/gcp/auth.py +2 -11
- dstack/_internal/core/backends/gcp/backend.py +17 -0
- dstack/_internal/core/backends/gcp/compute.py +14 -44
- dstack/_internal/{server/services/backends/configurators/gcp.py → core/backends/gcp/configurator.py} +46 -103
- dstack/_internal/core/backends/gcp/models.py +125 -0
- dstack/_internal/core/backends/kubernetes/__init__.py +0 -15
- dstack/_internal/core/backends/kubernetes/backend.py +16 -0
- dstack/_internal/core/backends/kubernetes/compute.py +16 -5
- dstack/_internal/core/backends/kubernetes/configurator.py +55 -0
- dstack/_internal/core/backends/kubernetes/models.py +72 -0
- dstack/_internal/core/backends/lambdalabs/__init__.py +0 -16
- dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +7 -28
- dstack/_internal/core/backends/lambdalabs/configurator.py +82 -0
- dstack/_internal/core/backends/lambdalabs/models.py +37 -0
- dstack/_internal/core/backends/local/__init__.py +0 -13
- dstack/_internal/core/backends/local/backend.py +14 -0
- dstack/_internal/core/backends/local/compute.py +16 -2
- dstack/_internal/core/backends/models.py +128 -0
- dstack/_internal/core/backends/oci/__init__.py +0 -15
- dstack/_internal/core/backends/oci/auth.py +1 -5
- dstack/_internal/core/backends/oci/backend.py +16 -0
- dstack/_internal/core/backends/oci/compute.py +9 -23
- dstack/_internal/{server/services/backends/configurators/oci.py → core/backends/oci/configurator.py} +40 -85
- dstack/_internal/core/{models/backends/oci.py → backends/oci/models.py} +24 -25
- dstack/_internal/core/backends/oci/region.py +1 -1
- dstack/_internal/core/backends/runpod/__init__.py +0 -15
- dstack/_internal/core/backends/runpod/backend.py +16 -0
- dstack/_internal/core/backends/runpod/compute.py +28 -6
- dstack/_internal/core/backends/runpod/configurator.py +59 -0
- dstack/_internal/core/backends/runpod/models.py +54 -0
- dstack/_internal/core/backends/template/__init__.py +0 -0
- dstack/_internal/core/backends/tensordock/__init__.py +0 -15
- dstack/_internal/core/backends/tensordock/backend.py +16 -0
- dstack/_internal/core/backends/tensordock/compute.py +8 -27
- dstack/_internal/core/backends/tensordock/configurator.py +68 -0
- dstack/_internal/core/backends/tensordock/models.py +38 -0
- dstack/_internal/core/backends/vastai/__init__.py +0 -15
- dstack/_internal/core/backends/vastai/backend.py +16 -0
- dstack/_internal/core/backends/vastai/compute.py +2 -2
- dstack/_internal/core/backends/vastai/configurator.py +66 -0
- dstack/_internal/core/backends/vastai/models.py +37 -0
- dstack/_internal/core/backends/vultr/__init__.py +0 -15
- dstack/_internal/core/backends/vultr/backend.py +16 -0
- dstack/_internal/core/backends/vultr/compute.py +10 -24
- dstack/_internal/core/backends/vultr/configurator.py +64 -0
- dstack/_internal/core/backends/vultr/models.py +34 -0
- dstack/_internal/core/models/backends/__init__.py +0 -184
- dstack/_internal/core/models/backends/base.py +0 -19
- dstack/_internal/core/models/configurations.py +22 -16
- dstack/_internal/core/models/envs.py +4 -3
- dstack/_internal/core/models/fleets.py +17 -22
- dstack/_internal/core/models/gateways.py +3 -3
- dstack/_internal/core/models/instances.py +24 -0
- dstack/_internal/core/models/profiles.py +85 -45
- dstack/_internal/core/models/projects.py +1 -1
- dstack/_internal/core/models/repos/base.py +0 -5
- dstack/_internal/core/models/repos/local.py +3 -3
- dstack/_internal/core/models/repos/remote.py +26 -12
- dstack/_internal/core/models/repos/virtual.py +1 -1
- dstack/_internal/core/models/resources.py +45 -76
- dstack/_internal/core/models/runs.py +21 -19
- dstack/_internal/core/models/volumes.py +1 -3
- dstack/_internal/core/services/profiles.py +7 -16
- dstack/_internal/core/services/repos.py +0 -4
- dstack/_internal/server/app.py +11 -4
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/process_gateways.py +4 -8
- dstack/_internal/server/background/tasks/process_instances.py +14 -9
- dstack/_internal/server/background/tasks/process_metrics.py +1 -1
- dstack/_internal/server/background/tasks/process_placement_groups.py +5 -1
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +80 -24
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +20 -38
- dstack/_internal/server/background/tasks/process_volumes.py +5 -2
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
- dstack/_internal/server/models.py +59 -9
- dstack/_internal/server/routers/backends.py +14 -23
- dstack/_internal/server/routers/instances.py +3 -4
- dstack/_internal/server/routers/metrics.py +31 -10
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/routers/repos.py +1 -2
- dstack/_internal/server/routers/runs.py +13 -59
- dstack/_internal/server/schemas/gateways.py +14 -23
- dstack/_internal/server/schemas/projects.py +7 -2
- dstack/_internal/server/schemas/repos.py +2 -38
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/schemas/runs.py +1 -24
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/__init__.py +85 -158
- dstack/_internal/server/services/config.py +53 -567
- dstack/_internal/server/services/fleets.py +9 -103
- dstack/_internal/server/services/gateways/__init__.py +13 -4
- dstack/_internal/server/services/{pools.py → instances.py} +22 -329
- dstack/_internal/server/services/jobs/__init__.py +9 -6
- dstack/_internal/server/services/jobs/configurators/base.py +25 -1
- dstack/_internal/server/services/jobs/configurators/dev.py +9 -1
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
- dstack/_internal/server/services/metrics.py +131 -72
- dstack/_internal/server/services/offers.py +1 -1
- dstack/_internal/server/services/projects.py +23 -14
- dstack/_internal/server/services/prometheus.py +245 -0
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +67 -31
- dstack/_internal/server/services/volumes.py +9 -4
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4a0fe83e84574654e397.js} +76 -19
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4a0fe83e84574654e397.js.map} +1 -1
- dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +75 -32
- dstack/_internal/utils/json_schema.py +6 -0
- dstack/_internal/utils/ssh.py +2 -1
- dstack/api/__init__.py +4 -0
- dstack/api/_public/__init__.py +16 -20
- dstack/api/_public/backends.py +1 -1
- dstack/api/_public/repos.py +36 -36
- dstack/api/_public/runs.py +170 -83
- dstack/api/server/__init__.py +11 -13
- dstack/api/server/_backends.py +12 -16
- dstack/api/server/_fleets.py +15 -55
- dstack/api/server/_gateways.py +3 -14
- dstack/api/server/_repos.py +1 -4
- dstack/api/server/_runs.py +21 -96
- dstack/api/server/_volumes.py +10 -5
- dstack/api/utils.py +3 -0
- dstack/version.py +1 -1
- {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/METADATA +10 -1
- {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/RECORD +229 -206
- tests/_internal/cli/services/configurators/test_profile.py +6 -6
- tests/_internal/core/backends/aws/test_configurator.py +35 -0
- tests/_internal/core/backends/aws/test_resources.py +1 -1
- tests/_internal/core/backends/azure/test_configurator.py +61 -0
- tests/_internal/core/backends/cudo/__init__.py +0 -0
- tests/_internal/core/backends/cudo/test_configurator.py +37 -0
- tests/_internal/core/backends/datacrunch/__init__.py +0 -0
- tests/_internal/core/backends/datacrunch/test_configurator.py +17 -0
- tests/_internal/core/backends/gcp/test_configurator.py +42 -0
- tests/_internal/core/backends/kubernetes/test_configurator.py +43 -0
- tests/_internal/core/backends/lambdalabs/__init__.py +0 -0
- tests/_internal/core/backends/lambdalabs/test_configurator.py +38 -0
- tests/_internal/core/backends/oci/test_configurator.py +55 -0
- tests/_internal/core/backends/runpod/__init__.py +0 -0
- tests/_internal/core/backends/runpod/test_configurator.py +33 -0
- tests/_internal/core/backends/tensordock/__init__.py +0 -0
- tests/_internal/core/backends/tensordock/test_configurator.py +38 -0
- tests/_internal/core/backends/vastai/__init__.py +0 -0
- tests/_internal/core/backends/vastai/test_configurator.py +33 -0
- tests/_internal/core/backends/vultr/__init__.py +0 -0
- tests/_internal/core/backends/vultr/test_configurator.py +33 -0
- tests/_internal/server/background/tasks/test_process_gateways.py +4 -0
- tests/_internal/server/background/tasks/test_process_instances.py +49 -48
- tests/_internal/server/background/tasks/test_process_metrics.py +0 -3
- tests/_internal/server/background/tasks/test_process_placement_groups.py +2 -0
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +186 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +123 -19
- tests/_internal/server/background/tasks/test_process_runs.py +8 -22
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +3 -40
- tests/_internal/server/background/tasks/test_process_submitted_volumes.py +2 -0
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +10 -15
- tests/_internal/server/routers/test_backends.py +6 -764
- tests/_internal/server/routers/test_fleets.py +2 -26
- tests/_internal/server/routers/test_gateways.py +27 -3
- tests/_internal/server/routers/test_instances.py +0 -10
- tests/_internal/server/routers/test_metrics.py +42 -0
- tests/_internal/server/routers/test_projects.py +56 -0
- tests/_internal/server/routers/test_prometheus.py +333 -0
- tests/_internal/server/routers/test_repos.py +0 -15
- tests/_internal/server/routers/test_runs.py +83 -275
- tests/_internal/server/routers/test_volumes.py +2 -3
- tests/_internal/server/services/backends/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/test_task.py +35 -0
- tests/_internal/server/services/test_config.py +7 -4
- tests/_internal/server/services/test_fleets.py +1 -4
- tests/_internal/server/services/{test_pools.py → test_instances.py} +11 -49
- tests/_internal/server/services/test_metrics.py +167 -0
- tests/_internal/server/services/test_repos.py +1 -14
- tests/_internal/server/services/test_runs.py +0 -4
- dstack/_internal/cli/commands/pool.py +0 -581
- dstack/_internal/cli/commands/run.py +0 -75
- dstack/_internal/core/backends/aws/config.py +0 -18
- dstack/_internal/core/backends/azure/config.py +0 -12
- dstack/_internal/core/backends/base/config.py +0 -5
- dstack/_internal/core/backends/cudo/config.py +0 -9
- dstack/_internal/core/backends/datacrunch/config.py +0 -9
- dstack/_internal/core/backends/gcp/config.py +0 -22
- dstack/_internal/core/backends/kubernetes/config.py +0 -6
- dstack/_internal/core/backends/lambdalabs/config.py +0 -9
- dstack/_internal/core/backends/nebius/__init__.py +0 -15
- dstack/_internal/core/backends/nebius/api_client.py +0 -319
- dstack/_internal/core/backends/nebius/compute.py +0 -220
- dstack/_internal/core/backends/nebius/config.py +0 -6
- dstack/_internal/core/backends/nebius/types.py +0 -37
- dstack/_internal/core/backends/oci/config.py +0 -6
- dstack/_internal/core/backends/runpod/config.py +0 -9
- dstack/_internal/core/backends/tensordock/config.py +0 -9
- dstack/_internal/core/backends/vastai/config.py +0 -6
- dstack/_internal/core/backends/vultr/config.py +0 -9
- dstack/_internal/core/models/backends/aws.py +0 -86
- dstack/_internal/core/models/backends/azure.py +0 -68
- dstack/_internal/core/models/backends/cudo.py +0 -43
- dstack/_internal/core/models/backends/datacrunch.py +0 -44
- dstack/_internal/core/models/backends/gcp.py +0 -67
- dstack/_internal/core/models/backends/kubernetes.py +0 -40
- dstack/_internal/core/models/backends/lambdalabs.py +0 -43
- dstack/_internal/core/models/backends/nebius.py +0 -54
- dstack/_internal/core/models/backends/runpod.py +0 -40
- dstack/_internal/core/models/backends/tensordock.py +0 -44
- dstack/_internal/core/models/backends/vastai.py +0 -43
- dstack/_internal/core/models/backends/vultr.py +0 -40
- dstack/_internal/core/models/pools.py +0 -43
- dstack/_internal/server/routers/pools.py +0 -142
- dstack/_internal/server/schemas/pools.py +0 -38
- dstack/_internal/server/services/backends/configurators/base.py +0 -72
- dstack/_internal/server/services/backends/configurators/cudo.py +0 -87
- dstack/_internal/server/services/backends/configurators/datacrunch.py +0 -79
- dstack/_internal/server/services/backends/configurators/kubernetes.py +0 -63
- dstack/_internal/server/services/backends/configurators/lambdalabs.py +0 -98
- dstack/_internal/server/services/backends/configurators/nebius.py +0 -85
- dstack/_internal/server/services/backends/configurators/runpod.py +0 -97
- dstack/_internal/server/services/backends/configurators/tensordock.py +0 -82
- dstack/_internal/server/services/backends/configurators/vastai.py +0 -80
- dstack/_internal/server/services/backends/configurators/vultr.py +0 -80
- dstack/api/_public/pools.py +0 -41
- dstack/api/_public/resources.py +0 -105
- dstack/api/server/_pools.py +0 -63
- tests/_internal/server/routers/test_pools.py +0 -612
- /dstack/_internal/{server/services/backends/configurators → core/backends/dstack}/__init__.py +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/WHEEL +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/top_level.txt +0 -0
|
@@ -1,113 +1,172 @@
|
|
|
1
1
|
import json
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from collections.abc import Sequence
|
|
2
4
|
from datetime import datetime, timezone
|
|
5
|
+
from typing import Optional
|
|
3
6
|
|
|
4
7
|
from sqlalchemy import select
|
|
5
8
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
6
9
|
|
|
7
|
-
from dstack._internal.core.
|
|
10
|
+
from dstack._internal.core.models.instances import Resources
|
|
8
11
|
from dstack._internal.core.models.metrics import JobMetrics, Metric
|
|
9
|
-
from dstack._internal.server.models import JobMetricsPoint, JobModel
|
|
10
|
-
from dstack._internal.server.services.jobs import
|
|
12
|
+
from dstack._internal.server.models import JobMetricsPoint, JobModel
|
|
13
|
+
from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
|
|
14
|
+
from dstack._internal.utils.common import get_or_error
|
|
15
|
+
from dstack._internal.utils.logging import get_logger
|
|
11
16
|
|
|
12
|
-
|
|
13
|
-
async def get_job_metrics(
|
|
14
|
-
session: AsyncSession,
|
|
15
|
-
project: ProjectModel,
|
|
16
|
-
run_name: str,
|
|
17
|
-
replica_num: int,
|
|
18
|
-
job_num: int,
|
|
19
|
-
) -> JobMetrics:
|
|
20
|
-
job_model = await get_run_job_model(
|
|
21
|
-
session=session,
|
|
22
|
-
project=project,
|
|
23
|
-
run_name=run_name,
|
|
24
|
-
replica_num=replica_num,
|
|
25
|
-
job_num=job_num,
|
|
26
|
-
)
|
|
27
|
-
if job_model is None:
|
|
28
|
-
raise ResourceNotExistsError("Found no job with given parameters")
|
|
29
|
-
job_metrics = await _get_job_metrics(
|
|
30
|
-
session=session,
|
|
31
|
-
job_model=job_model,
|
|
32
|
-
)
|
|
33
|
-
return job_metrics
|
|
17
|
+
logger = get_logger(__name__)
|
|
34
18
|
|
|
35
19
|
|
|
36
|
-
async def
|
|
20
|
+
async def get_job_metrics(
|
|
37
21
|
session: AsyncSession,
|
|
38
22
|
job_model: JobModel,
|
|
23
|
+
after: Optional[datetime] = None,
|
|
24
|
+
before: Optional[datetime] = None,
|
|
25
|
+
limit: Optional[int] = None,
|
|
39
26
|
) -> JobMetrics:
|
|
40
|
-
|
|
27
|
+
"""
|
|
28
|
+
Returns metrics ordered from the latest to the earliest.
|
|
29
|
+
|
|
30
|
+
Expected usage:
|
|
31
|
+
* limit=100 — get the latest 100 points
|
|
32
|
+
* after=<now - 1 hour> — get points for the last one hour
|
|
33
|
+
* before=<earliest timestamp from the last batch>, limit=100 — paginate back in history
|
|
34
|
+
"""
|
|
35
|
+
stmt = (
|
|
41
36
|
select(JobMetricsPoint)
|
|
42
37
|
.where(JobMetricsPoint.job_id == job_model.id)
|
|
43
38
|
.order_by(JobMetricsPoint.timestamp_micro.desc())
|
|
44
|
-
.limit(2)
|
|
45
39
|
)
|
|
40
|
+
if after is not None:
|
|
41
|
+
# we need +1 point for cpu_usage_percent, thus >=
|
|
42
|
+
stmt = stmt.where(JobMetricsPoint.timestamp_micro >= _datetime_to_unix_time_micro(after))
|
|
43
|
+
if before is not None:
|
|
44
|
+
stmt = stmt.where(JobMetricsPoint.timestamp_micro < _datetime_to_unix_time_micro(before))
|
|
45
|
+
if limit is not None:
|
|
46
|
+
# +1 for cpu_usage_percent
|
|
47
|
+
stmt = stmt.limit(limit + 1)
|
|
48
|
+
res = await session.execute(stmt)
|
|
46
49
|
points = res.scalars().all()
|
|
50
|
+
# we need at least 2 points to calculate cpu_usage_percent
|
|
47
51
|
if len(points) < 2:
|
|
48
52
|
return JobMetrics(metrics=[])
|
|
49
|
-
|
|
50
|
-
prev_point = points[1]
|
|
51
|
-
return _calculate_job_metrics(last_point, prev_point)
|
|
53
|
+
return _calculate_job_metrics(job_model, points)
|
|
52
54
|
|
|
53
55
|
|
|
54
|
-
def _calculate_job_metrics(
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
56
|
+
def _calculate_job_metrics(job_model: JobModel, points: Sequence[JobMetricsPoint]) -> JobMetrics:
|
|
57
|
+
timestamps: list[datetime] = []
|
|
58
|
+
cpu_usage_points: list[int] = []
|
|
59
|
+
memory_usage_points: list[int] = []
|
|
60
|
+
memory_working_set_points: list[int] = []
|
|
61
|
+
gpus_memory_usage_points: defaultdict[int, list[int]] = defaultdict(list)
|
|
62
|
+
gpus_util_points: defaultdict[int, list[int]] = defaultdict(list)
|
|
63
|
+
|
|
64
|
+
cpus_detected_num: Optional[int] = None
|
|
65
|
+
memory_total: Optional[int] = None
|
|
66
|
+
gpu_memory_total: Optional[int] = None
|
|
67
|
+
resources: Optional[Resources] = None
|
|
68
|
+
jrd = get_job_runtime_data(job_model)
|
|
69
|
+
if jrd is not None and jrd.offer is not None:
|
|
70
|
+
resources = jrd.offer.instance.resources
|
|
71
|
+
else:
|
|
72
|
+
jpd = get_job_provisioning_data(job_model)
|
|
73
|
+
if jpd is not None:
|
|
74
|
+
resources = jpd.instance_type.resources
|
|
75
|
+
if resources is not None:
|
|
76
|
+
cpus_detected_num = resources.cpus
|
|
77
|
+
memory_total = resources.memory_mib * 1024 * 1024
|
|
78
|
+
if len(resources.gpus) > 0:
|
|
79
|
+
gpu_memory_total = resources.gpus[0].memory_mib * 1024 * 1024
|
|
80
|
+
|
|
81
|
+
gpus_detected_num: Optional[int] = None
|
|
82
|
+
gpus_detected_num_mismatch: bool = False
|
|
83
|
+
for point, prev_point in zip(points, points[1:]):
|
|
84
|
+
timestamps.append(_unix_time_micro_to_datetime(point.timestamp_micro))
|
|
85
|
+
cpu_usage_points.append(_get_cpu_usage(point, prev_point))
|
|
86
|
+
memory_usage_points.append(point.memory_usage_bytes)
|
|
87
|
+
memory_working_set_points.append(point.memory_working_set_bytes)
|
|
88
|
+
gpus_memory_usage = json.loads(point.gpus_memory_usage_bytes)
|
|
89
|
+
gpus_util = json.loads(point.gpus_util_percent)
|
|
90
|
+
if gpus_detected_num is None:
|
|
91
|
+
gpus_detected_num = len(gpus_memory_usage)
|
|
92
|
+
if len(gpus_memory_usage) != gpus_detected_num or len(gpus_util) != gpus_detected_num:
|
|
93
|
+
gpus_detected_num_mismatch = True
|
|
94
|
+
if not gpus_detected_num_mismatch:
|
|
95
|
+
for i in range(gpus_detected_num):
|
|
96
|
+
gpus_memory_usage_points[i].append(gpus_memory_usage[i])
|
|
97
|
+
gpus_util_points[i].append(gpus_util[i])
|
|
98
|
+
|
|
99
|
+
metrics: list[Metric] = [
|
|
58
100
|
Metric(
|
|
59
101
|
name="cpu_usage_percent",
|
|
60
|
-
timestamps=
|
|
61
|
-
values=
|
|
62
|
-
)
|
|
63
|
-
)
|
|
64
|
-
metrics.append(
|
|
102
|
+
timestamps=timestamps,
|
|
103
|
+
values=cpu_usage_points,
|
|
104
|
+
),
|
|
65
105
|
Metric(
|
|
66
106
|
name="memory_usage_bytes",
|
|
67
|
-
timestamps=
|
|
68
|
-
values=
|
|
69
|
-
)
|
|
70
|
-
)
|
|
71
|
-
metrics.append(
|
|
107
|
+
timestamps=timestamps,
|
|
108
|
+
values=memory_usage_points,
|
|
109
|
+
),
|
|
72
110
|
Metric(
|
|
73
111
|
name="memory_working_set_bytes",
|
|
74
|
-
timestamps=
|
|
75
|
-
values=
|
|
76
|
-
)
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
)
|
|
89
|
-
for i in range(gpus_detected_num):
|
|
112
|
+
timestamps=timestamps,
|
|
113
|
+
values=memory_working_set_points,
|
|
114
|
+
),
|
|
115
|
+
]
|
|
116
|
+
if cpus_detected_num is not None:
|
|
117
|
+
metrics.append(_make_constant_metric("cpus_detected_num", timestamps, cpus_detected_num))
|
|
118
|
+
if memory_total is not None:
|
|
119
|
+
metrics.append(_make_constant_metric("memory_total_bytes", timestamps, memory_total))
|
|
120
|
+
if gpus_detected_num_mismatch:
|
|
121
|
+
# If number of GPUs changed in the time window, skip GPU metrics altogether, otherwise
|
|
122
|
+
# results can be unpredictable (e.g, one GPU takes place of another, as they are
|
|
123
|
+
# identified by an array index only).
|
|
124
|
+
logger.warning("gpus_detected_num mismatch, skipping GPU metrics")
|
|
125
|
+
else:
|
|
90
126
|
metrics.append(
|
|
91
|
-
|
|
92
|
-
name=f"gpu_memory_usage_bytes_gpu{i}",
|
|
93
|
-
timestamps=[timestamp],
|
|
94
|
-
values=[gpus_memory_usage_bytes[i]],
|
|
95
|
-
)
|
|
127
|
+
_make_constant_metric("gpus_detected_num", timestamps, get_or_error(gpus_detected_num))
|
|
96
128
|
)
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
129
|
+
if gpu_memory_total is not None:
|
|
130
|
+
metrics.append(
|
|
131
|
+
_make_constant_metric("gpu_memory_total_bytes", timestamps, gpu_memory_total)
|
|
132
|
+
)
|
|
133
|
+
for index, gpu_memory_usage_points in gpus_memory_usage_points.items():
|
|
134
|
+
metrics.append(
|
|
135
|
+
Metric(
|
|
136
|
+
name=f"gpu_memory_usage_bytes_gpu{index}",
|
|
137
|
+
timestamps=timestamps,
|
|
138
|
+
values=gpu_memory_usage_points,
|
|
139
|
+
)
|
|
140
|
+
)
|
|
141
|
+
for index, gpu_util_points in gpus_util_points.items():
|
|
142
|
+
metrics.append(
|
|
143
|
+
Metric(
|
|
144
|
+
name=f"gpu_util_percent_gpu{index}",
|
|
145
|
+
timestamps=timestamps,
|
|
146
|
+
values=gpu_util_points,
|
|
147
|
+
)
|
|
102
148
|
)
|
|
103
|
-
)
|
|
104
149
|
return JobMetrics(metrics=metrics)
|
|
105
150
|
|
|
106
151
|
|
|
152
|
+
def _make_constant_metric(name: str, timestamps: list[datetime], value: float) -> Metric:
|
|
153
|
+
return Metric(
|
|
154
|
+
name=name,
|
|
155
|
+
timestamps=timestamps,
|
|
156
|
+
values=[value] * len(timestamps),
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
|
|
107
160
|
def _get_cpu_usage(last_point: JobMetricsPoint, prev_point: JobMetricsPoint) -> int:
|
|
108
161
|
window = last_point.timestamp_micro - prev_point.timestamp_micro
|
|
162
|
+
if window == 0:
|
|
163
|
+
return 0
|
|
109
164
|
return round((last_point.cpu_usage_micro - prev_point.cpu_usage_micro) / window * 100)
|
|
110
165
|
|
|
111
166
|
|
|
112
167
|
def _unix_time_micro_to_datetime(unix_time_ms: int) -> datetime:
|
|
113
168
|
return datetime.fromtimestamp(unix_time_ms / 1_000_000, tz=timezone.utc)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _datetime_to_unix_time_micro(dt: datetime) -> int:
|
|
172
|
+
return int(dt.timestamp() * 1_000_000)
|
|
@@ -7,7 +7,7 @@ from dstack._internal.core.backends import (
|
|
|
7
7
|
BACKENDS_WITH_MULTINODE_SUPPORT,
|
|
8
8
|
BACKENDS_WITH_RESERVATION_SUPPORT,
|
|
9
9
|
)
|
|
10
|
-
from dstack._internal.core.backends.base import Backend
|
|
10
|
+
from dstack._internal.core.backends.base.backend import Backend
|
|
11
11
|
from dstack._internal.core.models.backends.base import BackendType
|
|
12
12
|
from dstack._internal.core.models.instances import (
|
|
13
13
|
InstanceOfferWithAvailability,
|
|
@@ -7,19 +7,22 @@ from sqlalchemy import func as safunc
|
|
|
7
7
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
8
8
|
from sqlalchemy.orm import joinedload
|
|
9
9
|
|
|
10
|
-
from dstack._internal.core.
|
|
11
|
-
from dstack._internal.core.models
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
DstackConfigInfo,
|
|
10
|
+
from dstack._internal.core.backends.configurators import get_configurator
|
|
11
|
+
from dstack._internal.core.backends.dstack.models import (
|
|
12
|
+
DstackBackendConfig,
|
|
13
|
+
DstackBaseBackendConfig,
|
|
15
14
|
)
|
|
15
|
+
from dstack._internal.core.backends.models import BackendInfo
|
|
16
|
+
from dstack._internal.core.errors import ForbiddenError, ResourceExistsError, ServerClientError
|
|
16
17
|
from dstack._internal.core.models.common import is_core_model_instance
|
|
17
18
|
from dstack._internal.core.models.projects import Member, MemberPermissions, Project
|
|
18
19
|
from dstack._internal.core.models.users import GlobalRole, ProjectRole
|
|
19
20
|
from dstack._internal.server.models import MemberModel, ProjectModel, UserModel
|
|
20
21
|
from dstack._internal.server.schemas.projects import MemberSetting
|
|
21
22
|
from dstack._internal.server.services import users
|
|
22
|
-
from dstack._internal.server.services.backends import
|
|
23
|
+
from dstack._internal.server.services.backends import (
|
|
24
|
+
get_backend_config_from_backend_model,
|
|
25
|
+
)
|
|
23
26
|
from dstack._internal.server.services.permissions import get_default_permissions
|
|
24
27
|
from dstack._internal.server.settings import DEFAULT_PROJECT_NAME
|
|
25
28
|
from dstack._internal.utils.common import get_current_datetime, run_async
|
|
@@ -176,12 +179,16 @@ async def set_project_members(
|
|
|
176
179
|
# FIXME: potentially long write transaction
|
|
177
180
|
# clear_project_members() issues DELETE without commit
|
|
178
181
|
await clear_project_members(session=session, project=project)
|
|
179
|
-
|
|
180
|
-
res = await session.execute(
|
|
182
|
+
names = [m.username for m in members]
|
|
183
|
+
res = await session.execute(
|
|
184
|
+
select(UserModel).where((UserModel.name.in_(names)) | (UserModel.email.in_(names)))
|
|
185
|
+
)
|
|
181
186
|
users = res.scalars().all()
|
|
187
|
+
# Create lookup maps for both username and email
|
|
182
188
|
username_to_user = {user.name: user for user in users}
|
|
189
|
+
email_to_user = {user.email: user for user in users if user.email}
|
|
183
190
|
for i, member in enumerate(members):
|
|
184
|
-
user_to_add = username_to_user.get(member.username)
|
|
191
|
+
user_to_add = username_to_user.get(member.username) or email_to_user.get(member.username)
|
|
185
192
|
if user_to_add is None:
|
|
186
193
|
continue
|
|
187
194
|
await add_project_member(
|
|
@@ -376,20 +383,22 @@ def project_model_to_project(
|
|
|
376
383
|
b.type.value,
|
|
377
384
|
)
|
|
378
385
|
continue
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
386
|
+
backend_config = get_backend_config_from_backend_model(
|
|
387
|
+
configurator, b, include_creds=False
|
|
388
|
+
)
|
|
389
|
+
if is_core_model_instance(backend_config, DstackBackendConfig):
|
|
390
|
+
for backend_type in backend_config.base_backends:
|
|
382
391
|
backends.append(
|
|
383
392
|
BackendInfo(
|
|
384
393
|
name=backend_type,
|
|
385
|
-
config=
|
|
394
|
+
config=DstackBaseBackendConfig(type=backend_type),
|
|
386
395
|
)
|
|
387
396
|
)
|
|
388
397
|
else:
|
|
389
398
|
backends.append(
|
|
390
399
|
BackendInfo(
|
|
391
400
|
name=b.type,
|
|
392
|
-
config=
|
|
401
|
+
config=backend_config,
|
|
393
402
|
)
|
|
394
403
|
)
|
|
395
404
|
return Project(
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
from collections.abc import Generator, Iterable
|
|
3
|
+
from datetime import timezone
|
|
4
|
+
|
|
5
|
+
from prometheus_client import Metric
|
|
6
|
+
from prometheus_client.parser import text_string_to_metric_families
|
|
7
|
+
from prometheus_client.samples import Sample
|
|
8
|
+
from sqlalchemy import select
|
|
9
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
+
from sqlalchemy.orm import joinedload
|
|
11
|
+
|
|
12
|
+
from dstack._internal.core.models.instances import InstanceStatus
|
|
13
|
+
from dstack._internal.core.models.runs import JobStatus, RunSpec
|
|
14
|
+
from dstack._internal.server.models import (
|
|
15
|
+
InstanceModel,
|
|
16
|
+
JobModel,
|
|
17
|
+
JobPrometheusMetrics,
|
|
18
|
+
ProjectModel,
|
|
19
|
+
RunModel,
|
|
20
|
+
)
|
|
21
|
+
from dstack._internal.server.services.instances import get_instance_offer
|
|
22
|
+
from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
|
|
23
|
+
from dstack._internal.utils.common import get_current_datetime
|
|
24
|
+
|
|
25
|
+
# Prometheus metric family names exposed by this module.
# Instance-level metrics (one sample per active instance):
_INSTANCE_DURATION = "dstack_instance_duration_seconds_total"
_INSTANCE_PRICE = "dstack_instance_price_dollars_per_hour"
_INSTANCE_GPU_COUNT = "dstack_instance_gpu_count"
# Job-level metrics (one sample per active job):
_JOB_DURATION = "dstack_job_duration_seconds_total"
_JOB_PRICE = "dstack_job_price_dollars_per_hour"
_JOB_GPU_COUNT = "dstack_job_gpu_count"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
async def get_metrics(session: AsyncSession) -> str:
    """Collect all server-wide metrics and render them in Prometheus text format."""
    instance_metrics = await get_instance_metrics(session)
    job_metrics = await get_job_metrics(session)
    job_gpu_metrics = await get_job_gpu_metrics(session)
    combined = itertools.chain(instance_metrics, job_metrics, job_gpu_metrics)
    # Exposition format requires a trailing newline after the last line.
    return "\n".join(_render_metrics(combined)) + "\n"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
async def get_instance_metrics(session: AsyncSession) -> Iterable[Metric]:
    """Build Prometheus metrics for all non-deleted instances in active states.

    Produces duration (counter), price (gauge), and GPU count (gauge)
    families with one sample per instance, ordered by project and
    instance name.
    """
    active_statuses = [
        InstanceStatus.PROVISIONING,
        InstanceStatus.IDLE,
        InstanceStatus.BUSY,
        InstanceStatus.TERMINATING,
    ]
    stmt = (
        select(InstanceModel)
        .join(ProjectModel)
        .where(
            InstanceModel.deleted == False,  # noqa: E712 — SQLAlchemy expression
            InstanceModel.status.in_(active_statuses),
        )
        .order_by(ProjectModel.name, InstanceModel.name)
        .options(
            joinedload(InstanceModel.project),
            joinedload(InstanceModel.fleet),
        )
    )
    res = await session.execute(stmt)
    instances = res.unique().scalars().all()

    duration_metric = Metric(
        name=_INSTANCE_DURATION,
        documentation="Total seconds the instance is running",
        typ="counter",
    )
    price_metric = Metric(
        name=_INSTANCE_PRICE, documentation="Instance price, USD/hour", typ="gauge"
    )
    gpu_count_metric = Metric(
        name=_INSTANCE_GPU_COUNT, documentation="Instance GPU count", typ="gauge"
    )

    now = get_current_datetime()
    for instance in instances:
        fleet = instance.fleet
        offer = get_instance_offer(instance)
        gpu_name = ""
        num_gpus = 0
        if offer is not None and len(offer.instance.resources.gpus) > 0:
            gpu_name = offer.instance.resources.gpus[0].name
            num_gpus = len(offer.instance.resources.gpus)
        labels: dict[str, str] = {
            "dstack_project_name": instance.project.name,
            "dstack_fleet_name": fleet.name if fleet is not None else "",
            "dstack_fleet_id": str(fleet.id) if fleet is not None else "",
            "dstack_instance_name": str(instance.name),
            "dstack_instance_id": str(instance.id),
            "dstack_instance_type": offer.instance.name if offer is not None else "",
            "dstack_backend": instance.backend.value if instance.backend is not None else "",
            "dstack_gpu": gpu_name,
        }
        # created_at is stored naive; interpret it as UTC before subtracting.
        uptime = (now - instance.created_at.replace(tzinfo=timezone.utc)).total_seconds()
        duration_metric.add_sample(name=_INSTANCE_DURATION, labels=labels, value=uptime)
        price_metric.add_sample(
            name=_INSTANCE_PRICE, labels=labels, value=instance.price or 0.0
        )
        gpu_count_metric.add_sample(
            name=_INSTANCE_GPU_COUNT, labels=labels, value=num_gpus
        )
    return [duration_metric, price_metric, gpu_count_metric]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
    """Build Prometheus metrics for all jobs in active states.

    Produces duration (counter), price (gauge), and GPU count (gauge)
    families with one sample per provisioned job, ordered by project and
    job name. Jobs without provisioning data are skipped.
    """
    active_statuses = [
        JobStatus.PROVISIONING,
        JobStatus.PULLING,
        JobStatus.RUNNING,
        JobStatus.TERMINATING,
    ]
    stmt = (
        select(JobModel)
        .join(ProjectModel)
        .where(JobModel.status.in_(active_statuses))
        .order_by(ProjectModel.name, JobModel.job_name)
        .options(
            joinedload(JobModel.project),
            joinedload(JobModel.run).joinedload(RunModel.user),
        )
    )
    res = await session.execute(stmt)
    jobs = res.scalars().all()

    duration_metric = Metric(
        name=_JOB_DURATION, documentation="Total seconds the job is running", typ="counter"
    )
    price_metric = Metric(
        name=_JOB_PRICE, documentation="Job instance price, USD/hour", typ="gauge"
    )
    gpu_count_metric = Metric(name=_JOB_GPU_COUNT, documentation="Job GPU count", typ="gauge")

    now = get_current_datetime()
    for job in jobs:
        jpd = get_job_provisioning_data(job)
        if jpd is None:
            # Not provisioned yet — nothing to report for this job.
            continue
        jrd = get_job_runtime_data(job)
        # Prefer runtime offer data when available; fall back to provisioning data.
        if jrd is not None and jrd.offer is not None:
            gpus = jrd.offer.instance.resources.gpus
            price = jrd.offer.price
        else:
            gpus = jpd.instance_type.resources.gpus
            price = jpd.price
        run_spec = RunSpec.__response__.parse_raw(job.run.run_spec)
        labels = _get_job_labels(job)
        labels["dstack_run_type"] = run_spec.configuration.type
        labels["dstack_backend"] = jpd.get_base_backend().value
        labels["dstack_gpu"] = gpus[0].name if gpus else ""
        # submitted_at is stored naive; interpret it as UTC before subtracting.
        elapsed = (now - job.submitted_at.replace(tzinfo=timezone.utc)).total_seconds()
        duration_metric.add_sample(name=_JOB_DURATION, labels=labels, value=elapsed)
        price_metric.add_sample(name=_JOB_PRICE, labels=labels, value=price)
        gpu_count_metric.add_sample(name=_JOB_GPU_COUNT, labels=labels, value=len(gpus))
    return [duration_metric, price_metric, gpu_count_metric]
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
async def get_job_gpu_metrics(session: AsyncSession) -> Iterable[Metric]:
    """Load stored per-job Prometheus metrics for all running jobs.

    Parses the stored exposition text and enriches each sample with
    job-identifying labels. Ordered by project and job name.
    """
    stmt = (
        select(JobPrometheusMetrics)
        .join(JobModel)
        .join(ProjectModel)
        .where(JobModel.status.in_([JobStatus.RUNNING]))
        .order_by(ProjectModel.name, JobModel.job_name)
        .options(
            joinedload(JobPrometheusMetrics.job).joinedload(JobModel.project),
            joinedload(JobPrometheusMetrics.job)
            .joinedload(JobModel.run)
            .joinedload(RunModel.user),
        )
    )
    metrics_models = (await session.execute(stmt)).scalars().all()
    return _parse_and_enrich_job_gpu_metrics(metrics_models)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
async def get_project_metrics(session: AsyncSession, project: ProjectModel) -> str:
    """Render stored job metrics of a single project in Prometheus text format."""
    stmt = (
        select(JobPrometheusMetrics)
        .join(JobModel)
        .where(
            JobModel.project_id == project.id,
            JobModel.status.in_([JobStatus.RUNNING]),
        )
        .order_by(JobModel.job_name)
        .options(
            joinedload(JobPrometheusMetrics.job).joinedload(JobModel.project),
            joinedload(JobPrometheusMetrics.job)
            .joinedload(JobModel.run)
            .joinedload(RunModel.user),
        )
    )
    metrics_models = (await session.execute(stmt)).scalars().all()
    enriched = _parse_and_enrich_job_gpu_metrics(metrics_models)
    # Exposition format requires a trailing newline after the last line.
    return "\n".join(_render_metrics(enriched)) + "\n"
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _parse_and_enrich_job_gpu_metrics(
    metrics_models: Iterable[JobPrometheusMetrics],
) -> Iterable[Metric]:
    """Parse stored Prometheus text per job and merge families across jobs.

    Families with the same name from different jobs are merged into one
    Metric; every sample gets the job-identifying labels added.
    """
    merged: dict[str, Metric] = {}
    for metrics_model in metrics_models:
        for parsed in text_string_to_metric_families(metrics_model.text):
            original_samples = parsed.samples
            parsed.samples = []
            family_name = parsed.name
            # First occurrence of a name wins as the merge target; later
            # occurrences only contribute their samples.
            target = merged.setdefault(family_name, parsed)
            for sample in original_samples:
                enriched_labels = sample.labels
                enriched_labels.update(_get_job_labels(metrics_model.job))
                # text_string_to_metric_families "fixes" counter names by
                # appending _total; rebuild the Sample with the family name
                # to revert this.
                target.samples.append(Sample(family_name, enriched_labels, *sample[2:]))
    return merged.values()
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _get_job_labels(job: JobModel) -> dict[str, str]:
    """Return the common set of identifying Prometheus labels for a job."""
    labels: dict[str, str] = {}
    labels["dstack_project_name"] = job.project.name
    labels["dstack_user_name"] = job.run.user.name
    labels["dstack_run_name"] = job.run_name
    labels["dstack_run_id"] = str(job.run_id)
    labels["dstack_job_name"] = job.job_name
    labels["dstack_job_id"] = str(job.id)
    labels["dstack_job_num"] = str(job.job_num)
    labels["dstack_replica_num"] = str(job.replica_num)
    return labels
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _render_metrics(metrics: Iterable[Metric]) -> Generator[str, None, None]:
|
|
233
|
+
for metric in metrics:
|
|
234
|
+
if not metric.samples:
|
|
235
|
+
continue
|
|
236
|
+
yield f"# HELP {metric.name} {metric.documentation}"
|
|
237
|
+
yield f"# TYPE {metric.name} {metric.type}"
|
|
238
|
+
for sample in metric.samples:
|
|
239
|
+
parts: list[str] = [f"{sample.name}{{"]
|
|
240
|
+
parts.extend(",".join(f'{name}="{value}"' for name, value in sample.labels.items()))
|
|
241
|
+
parts.append(f"}} {float(sample.value)}")
|
|
242
|
+
# text_string_to_metric_families converts milliseconds to float seconds
|
|
243
|
+
if isinstance(sample.timestamp, float):
|
|
244
|
+
parts.append(f" {int(sample.timestamp * 1000)}")
|
|
245
|
+
yield "".join(parts)
|
|
@@ -178,9 +178,6 @@ class ShimClient:
|
|
|
178
178
|
# API v1 (a.k.a. Legacy API) — `/api/{submit,pull,stop}`
|
|
179
179
|
_API_V2_MIN_SHIM_VERSION = (0, 18, 34)
|
|
180
180
|
|
|
181
|
-
# A surrogate task ID for API-v1-over-v2 emulation (`_v2_compat_*` methods)
|
|
182
|
-
_LEGACY_TASK_ID = "00000000-0000-0000-0000-000000000000"
|
|
183
|
-
|
|
184
181
|
_shim_version: Optional["_Version"]
|
|
185
182
|
_api_version: int
|
|
186
183
|
_negotiated: bool = False
|
|
@@ -339,6 +336,20 @@ class ShimClient:
|
|
|
339
336
|
resp = self._request("GET", "/api/pull", raise_for_status=True)
|
|
340
337
|
return self._response(LegacyPullResponse, resp)
|
|
341
338
|
|
|
339
|
+
    # Metrics

    def get_task_metrics(self, task_id: "_TaskID") -> Optional[str]:
        """Fetch Prometheus metrics exposition text for a task from the shim.

        Returns the raw metrics text, or None when metrics cannot be
        collected (exporter not installed, old shim, or exporter error).
        Any other non-success HTTP status raises via `_raise_for_status`.
        """
        resp = self._request("GET", f"/metrics/tasks/{task_id}")
        if resp.status_code == HTTPStatus.NOT_FOUND:
            # Metrics exporter is not installed or old shim version
            return None
        if resp.status_code == HTTPStatus.BAD_GATEWAY:
            # Metrics exporter is not available or returned an error
            logger.info("failed to collect metrics for task %s: %s", task_id, resp.text)
            return None
        self._raise_for_status(resp)
        return resp.text
|
|
352
|
+
|
|
342
353
|
# Private methods used for public methods implementations
|
|
343
354
|
|
|
344
355
|
def _request(
|