dstack-0.18.42-py3-none-any.whl → dstack-0.18.44-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of dstack has been flagged as potentially problematic.

Files changed (115)
  1. dstack/_internal/cli/commands/__init__.py +2 -1
  2. dstack/_internal/cli/commands/apply.py +4 -2
  3. dstack/_internal/cli/commands/attach.py +21 -1
  4. dstack/_internal/cli/commands/completion.py +20 -0
  5. dstack/_internal/cli/commands/delete.py +3 -1
  6. dstack/_internal/cli/commands/fleet.py +2 -1
  7. dstack/_internal/cli/commands/gateway.py +7 -2
  8. dstack/_internal/cli/commands/logs.py +3 -2
  9. dstack/_internal/cli/commands/stats.py +2 -1
  10. dstack/_internal/cli/commands/stop.py +2 -1
  11. dstack/_internal/cli/commands/volume.py +2 -1
  12. dstack/_internal/cli/main.py +6 -0
  13. dstack/_internal/cli/services/completion.py +86 -0
  14. dstack/_internal/cli/services/configurators/run.py +11 -17
  15. dstack/_internal/cli/utils/fleet.py +5 -1
  16. dstack/_internal/cli/utils/run.py +11 -0
  17. dstack/_internal/core/backends/aws/compute.py +23 -10
  18. dstack/_internal/core/backends/aws/resources.py +3 -3
  19. dstack/_internal/core/backends/azure/compute.py +15 -9
  20. dstack/_internal/core/backends/azure/resources.py +2 -0
  21. dstack/_internal/core/backends/base/compute.py +102 -2
  22. dstack/_internal/core/backends/base/offers.py +7 -1
  23. dstack/_internal/core/backends/cudo/compute.py +8 -4
  24. dstack/_internal/core/backends/datacrunch/compute.py +10 -4
  25. dstack/_internal/core/backends/gcp/auth.py +19 -13
  26. dstack/_internal/core/backends/gcp/compute.py +26 -20
  27. dstack/_internal/core/backends/gcp/resources.py +3 -10
  28. dstack/_internal/core/backends/kubernetes/compute.py +4 -3
  29. dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
  30. dstack/_internal/core/backends/nebius/compute.py +2 -2
  31. dstack/_internal/core/backends/oci/compute.py +10 -4
  32. dstack/_internal/core/backends/runpod/compute.py +32 -7
  33. dstack/_internal/core/backends/runpod/config.py +8 -0
  34. dstack/_internal/core/backends/tensordock/compute.py +14 -3
  35. dstack/_internal/core/backends/vastai/compute.py +12 -2
  36. dstack/_internal/core/backends/vultr/api_client.py +3 -3
  37. dstack/_internal/core/backends/vultr/compute.py +9 -3
  38. dstack/_internal/core/models/backends/aws.py +2 -0
  39. dstack/_internal/core/models/backends/base.py +1 -0
  40. dstack/_internal/core/models/backends/runpod.py +2 -0
  41. dstack/_internal/core/models/configurations.py +2 -2
  42. dstack/_internal/core/models/profiles.py +46 -1
  43. dstack/_internal/core/models/runs.py +4 -0
  44. dstack/_internal/core/services/__init__.py +5 -1
  45. dstack/_internal/core/services/configs/__init__.py +3 -0
  46. dstack/_internal/server/app.py +11 -1
  47. dstack/_internal/server/background/__init__.py +10 -0
  48. dstack/_internal/server/background/tasks/common.py +22 -0
  49. dstack/_internal/server/background/tasks/process_instances.py +11 -18
  50. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
  51. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
  52. dstack/_internal/server/background/tasks/process_running_jobs.py +74 -34
  53. dstack/_internal/server/background/tasks/process_runs.py +1 -0
  54. dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
  55. dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
  56. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  57. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  58. dstack/_internal/server/models.py +11 -0
  59. dstack/_internal/server/routers/logs.py +3 -0
  60. dstack/_internal/server/routers/metrics.py +21 -2
  61. dstack/_internal/server/routers/prometheus.py +36 -0
  62. dstack/_internal/server/security/permissions.py +1 -1
  63. dstack/_internal/server/services/backends/configurators/aws.py +31 -1
  64. dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
  65. dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
  66. dstack/_internal/server/services/config.py +24 -4
  67. dstack/_internal/server/services/fleets.py +1 -0
  68. dstack/_internal/server/services/gateways/__init__.py +1 -0
  69. dstack/_internal/server/services/jobs/__init__.py +12 -9
  70. dstack/_internal/server/services/jobs/configurators/base.py +9 -1
  71. dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
  72. dstack/_internal/server/services/jobs/configurators/task.py +1 -3
  73. dstack/_internal/server/services/logs/__init__.py +78 -0
  74. dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
  75. dstack/_internal/server/services/logs/base.py +47 -0
  76. dstack/_internal/server/services/logs/filelog.py +110 -0
  77. dstack/_internal/server/services/logs/gcp.py +165 -0
  78. dstack/_internal/server/services/metrics.py +103 -70
  79. dstack/_internal/server/services/pools.py +16 -17
  80. dstack/_internal/server/services/prometheus.py +87 -0
  81. dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
  82. dstack/_internal/server/services/runner/client.py +14 -3
  83. dstack/_internal/server/services/runs.py +43 -15
  84. dstack/_internal/server/services/volumes.py +1 -0
  85. dstack/_internal/server/settings.py +6 -0
  86. dstack/_internal/server/statics/index.html +1 -1
  87. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-4eb116b97819badd1e2c.js} +131 -78
  88. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
  89. dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-da9f8c06a69c20dac23e.css} +1 -1
  90. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  91. dstack/_internal/server/testing/common.py +50 -8
  92. dstack/api/_public/runs.py +4 -1
  93. dstack/api/server/_fleets.py +2 -0
  94. dstack/api/server/_runs.py +4 -0
  95. dstack/api/utils.py +3 -0
  96. dstack/version.py +2 -2
  97. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/METADATA +13 -3
  98. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/RECORD +115 -97
  99. tests/_internal/core/backends/base/__init__.py +0 -0
  100. tests/_internal/core/backends/base/test_compute.py +56 -0
  101. tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
  102. tests/_internal/server/background/tasks/test_process_running_jobs.py +126 -1
  103. tests/_internal/server/conftest.py +4 -5
  104. tests/_internal/server/routers/test_backends.py +1 -0
  105. tests/_internal/server/routers/test_fleets.py +2 -0
  106. tests/_internal/server/routers/test_logs.py +1 -1
  107. tests/_internal/server/routers/test_metrics.py +15 -0
  108. tests/_internal/server/routers/test_prometheus.py +244 -0
  109. tests/_internal/server/routers/test_runs.py +81 -58
  110. tests/_internal/server/services/test_logs.py +3 -3
  111. tests/_internal/server/services/test_metrics.py +163 -0
  112. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
  113. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
  114. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
  115. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
dstack/_internal/server/background/tasks/process_instances.py

@@ -45,7 +45,6 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     InstanceRuntime,
     InstanceStatus,
-    InstanceType,
     RemoteConnectionInfo,
     SSHKey,
 )
@@ -63,6 +62,7 @@ from dstack._internal.core.models.runs import (
     Retry,
 )
 from dstack._internal.core.services.profiles import get_retry
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -695,7 +695,8 @@ async def _check_instance(instance: InstanceModel) -> None:

     if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None:
         provisioning_deadline = _get_provisioning_deadline(
-            instance, job_provisioning_data.instance_type
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
         )
         if get_current_datetime() > provisioning_deadline:
             instance.status = InstanceStatus.TERMINATING
@@ -737,7 +738,8 @@ async def _wait_for_instance_provisioning_data(
         instance.name,
     )
     provisioning_deadline = _get_provisioning_deadline(
-        instance, job_provisioning_data.instance_type
+        instance=instance,
+        job_provisioning_data=job_provisioning_data,
     )
     if get_current_datetime() > provisioning_deadline:
         logger.warning(
@@ -959,24 +961,15 @@ def _get_retry_duration_deadline(instance: InstanceModel, retry: Retry) -> datet


 def _get_provisioning_deadline(
-    instance: InstanceModel, instance_type: InstanceType
+    instance: InstanceModel,
+    job_provisioning_data: JobProvisioningData,
 ) -> datetime.datetime:
-    timeout_interval = _get_instance_timeout_interval(instance.backend, instance_type.name)
+    timeout_interval = get_provisioning_timeout(
+        backend_type=job_provisioning_data.get_base_backend(),
+        instance_type_name=job_provisioning_data.instance_type.name,
+    )
     return instance.started_at.replace(tzinfo=datetime.timezone.utc) + timeout_interval


-def _get_instance_timeout_interval(
-    backend_type: BackendType, instance_type_name: str
-) -> timedelta:
-    # when changing timeouts, also consider process_running_jobs._get_runner_timeout_interval
-    if backend_type == BackendType.RUNPOD:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)
-
-
 def _ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]:
     return [pkey_from_str(sk.private) for sk in ssh_keys if sk.private is not None]
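Note: the removed _get_instance_timeout_interval above, together with _get_runner_timeout_interval (removed from process_running_jobs.py further down), is replaced by a single shared helper in the new dstack/_internal/server/background/tasks/common.py (+22 lines), whose contents are not shown in this diff. A plausible sketch, assuming it is a straight merge of the two removed functions:

from datetime import timedelta

from dstack._internal.core.models.backends.base import BackendType


def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
    # Union of the per-backend timeouts previously duplicated in
    # process_instances.py and process_running_jobs.py (values taken from the removed code).
    if backend_type in (BackendType.RUNPOD, BackendType.LAMBDA, BackendType.KUBERNETES):
        return timedelta(seconds=1200)
    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
        return timedelta(seconds=1200)
    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
        return timedelta(seconds=3300)
    return timedelta(seconds=600)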
dstack/_internal/server/background/tasks/process_placement_groups.py

@@ -28,6 +28,7 @@ async def process_placement_groups():
                 PlacementGroupModel.deleted == False,
                 PlacementGroupModel.id.not_in(lockset),
             )
+            .order_by(PlacementGroupModel.id)  # take locks in order
            .with_for_update(skip_locked=True)
         )
         placement_group_models = res.scalars().all()
dstack/_internal/server/background/tasks/process_prometheus_metrics.py (new file)

@@ -0,0 +1,135 @@
+import uuid
+from datetime import datetime, timedelta
+from typing import Optional
+
+import sqlalchemy.exc
+from sqlalchemy import delete, or_, select, update
+from sqlalchemy.orm import joinedload
+
+from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
+from dstack._internal.core.models.runs import JobStatus
+from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.models import InstanceModel, JobModel, JobPrometheusMetrics
+from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
+from dstack._internal.server.services.pools import get_instance_ssh_private_keys
+from dstack._internal.server.services.runner import client
+from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
+from dstack._internal.server.utils.common import gather_map_async
+from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+MAX_JOBS_FETCHED = 100
+BATCH_SIZE = 10
+MIN_COLLECT_INTERVAL_SECONDS = 9
+# 10 minutes should be more than enough to scrape metrics, and, in any case,
+# 10 minutes old metrics has little to no value
+METRICS_TTL_SECONDS = 600
+
+
+async def collect_prometheus_metrics():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=MIN_COLLECT_INTERVAL_SECONDS)
+    async with get_session_ctx() as session:
+        res = await session.execute(
+            select(JobModel)
+            .join(JobPrometheusMetrics, isouter=True)
+            .where(
+                JobModel.status.in_([JobStatus.RUNNING]),
+                or_(
+                    JobPrometheusMetrics.job_id.is_(None),
+                    JobPrometheusMetrics.collected_at < cutoff,
+                ),
+            )
+            .options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
+            .order_by(JobModel.last_processed_at.asc())
+            .limit(MAX_JOBS_FETCHED)
+        )
+        job_models = res.unique().scalars().all()
+    for batch in batched(job_models, BATCH_SIZE):
+        await _collect_jobs_metrics(batch, now)
+
+
+async def delete_prometheus_metrics():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=METRICS_TTL_SECONDS)
+    async with get_session_ctx() as session:
+        await session.execute(
+            delete(JobPrometheusMetrics).where(JobPrometheusMetrics.collected_at < cutoff)
+        )
+        await session.commit()
+
+
+async def _collect_jobs_metrics(job_models: list[JobModel], collected_at: datetime):
+    results = await gather_map_async(job_models, _collect_job_metrics, return_exceptions=True)
+    async with get_session_ctx() as session:
+        for job_model, result in results:
+            if result is None:
+                continue
+            if isinstance(result, BaseException):
+                logger.error(
+                    "Failed to collect job %s Prometheus metrics: %r", job_model.job_name, result
+                )
+                continue
+            res = await session.execute(
+                update(JobPrometheusMetrics)
+                .where(JobPrometheusMetrics.job_id == job_model.id)
+                .values(
+                    collected_at=collected_at,
+                    text=result,
+                )
+                .returning(JobPrometheusMetrics)
+            )
+            metrics = res.scalar()
+            if metrics is None:
+                metrics = JobPrometheusMetrics(
+                    job_id=job_model.id,
+                    collected_at=collected_at,
+                    text=result,
+                )
+                try:
+                    async with session.begin_nested():
+                        session.add(metrics)
+                except sqlalchemy.exc.IntegrityError:
+                    # Concurrent server replica already committed, ignoring
+                    pass
+        await session.commit()
+
+
+async def _collect_job_metrics(job_model: JobModel) -> Optional[str]:
+    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
+    jpd = get_job_provisioning_data(job_model)
+    jrd = get_job_runtime_data(job_model)
+    if jpd is None:
+        return None
+    try:
+        res = await run_async(
+            _pull_job_metrics,
+            ssh_private_keys,
+            jpd,
+            jrd,
+            job_model.id,
+        )
+    except Exception:
+        logger.exception("Failed to collect job %s Prometheus metrics", job_model.job_name)
+        return None
+
+    if isinstance(res, bool):
+        logger.warning(
+            "Failed to connect to job %s to collect Prometheus metrics", job_model.job_name
+        )
+        return None
+
+    if res is None:
+        # Either not supported by shim or exporter is not available
+        return None
+
+    return res
+
+
+@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
+def _pull_job_metrics(ports: dict[int, int], task_id: uuid.UUID) -> Optional[str]:
+    shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
+    return shim_client.get_task_metrics(task_id)
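dstack/_internal/server/background/__init__.py also gains 10 lines in this release (not shown above), presumably to schedule the two tasks. A minimal sketch, assuming the server's existing apscheduler-based task registration; the function name and the intervals below are guesses, chosen only for illustration:

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger

from dstack._internal.server.background.tasks.process_prometheus_metrics import (
    collect_prometheus_metrics,
    delete_prometheus_metrics,
)


def register_prometheus_tasks(scheduler: AsyncIOScheduler) -> None:
    # Hypothetical registration: scrape slightly more often than
    # MIN_COLLECT_INTERVAL_SECONDS, prune expired rows less frequently.
    scheduler.add_job(collect_prometheus_metrics, IntervalTrigger(seconds=10))
    scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5))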
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -1,4 +1,5 @@
 import asyncio
+from collections.abc import Iterable
 from datetime import timedelta
 from typing import Dict, List, Optional

@@ -16,11 +17,13 @@ from dstack._internal.core.models.instances import (
     RemoteConnectionInfo,
     SSHConnectionParams,
 )
+from dstack._internal.core.models.metrics import Metric
 from dstack._internal.core.models.repos import RemoteRepoCreds
 from dstack._internal.core.models.runs import (
     ClusterInfo,
     Job,
     JobProvisioningData,
+    JobRuntimeData,
     JobSpec,
     JobStatus,
     JobTerminationReason,
@@ -28,6 +31,7 @@ from dstack._internal.core.models.runs import (
     RunSpec,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
@@ -47,6 +51,7 @@ from dstack._internal.server.services.jobs import (
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
+from dstack._internal.server.services.metrics import get_job_metrics
 from dstack._internal.server.services.pools import get_instance_ssh_private_keys
 from dstack._internal.server.services.repos import (
     get_code_model,
@@ -148,6 +153,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         jobs=run.jobs,
         replica_num=job.job_spec.replica_num,
         job_provisioning_data=job_provisioning_data,
+        job_runtime_data=job_submission.job_runtime_data,
     )

     volumes = await get_job_attached_volumes(
@@ -242,7 +248,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):

         if not success:
             # check timeout
-            if job_submission.age > _get_runner_timeout_interval(
+            if job_submission.age > get_provisioning_timeout(
                 backend_type=job_provisioning_data.get_base_backend(),
                 instance_type_name=job_provisioning_data.instance_type.name,
             ):
@@ -341,6 +347,9 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             job_model.status = JobStatus.TERMINATING
             job_model.termination_reason = JobTerminationReason.GATEWAY_ERROR

+    if job_model.status == JobStatus.RUNNING:
+        await _check_gpu_utilization(session, job_model, job)
+
     job_model.last_processed_at = common_utils.get_current_datetime()
     await session.commit()

@@ -644,33 +653,74 @@ def _terminate_if_inactivity_duration_exceeded(
     run_model: RunModel, job_model: JobModel, no_connections_secs: Optional[int]
 ) -> None:
     conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration
-    if is_core_model_instance(conf, DevEnvironmentConfiguration) and isinstance(
+    if not is_core_model_instance(conf, DevEnvironmentConfiguration) or not isinstance(
         conf.inactivity_duration, int
     ):
-        logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs)
-        job_model.inactivity_secs = no_connections_secs
-        if no_connections_secs is None:
-            # TODO(0.19 or earlier): make no_connections_secs required
-            job_model.status = JobStatus.TERMINATING
-            job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
-            job_model.termination_reason_message = (
-                "The selected instance was created before dstack 0.18.41"
-                " and does not support inactivity_duration"
-            )
-        elif no_connections_secs >= conf.inactivity_duration:
-            job_model.status = JobStatus.TERMINATING
-            # TODO(0.19 or earlier): set JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
-            job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
-            job_model.termination_reason_message = (
-                f"The job was inactive for {no_connections_secs} seconds,"
-                f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds"
-            )
+        # reset in case inactivity_duration was disabled via in-place update
+        job_model.inactivity_secs = None
+        return
+    logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs)
+    job_model.inactivity_secs = no_connections_secs
+    if no_connections_secs is None:
+        # TODO(0.19 or earlier): make no_connections_secs required
+        job_model.status = JobStatus.TERMINATING
+        job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+        job_model.termination_reason_message = (
+            "The selected instance was created before dstack 0.18.41"
+            " and does not support inactivity_duration"
+        )
+    elif no_connections_secs >= conf.inactivity_duration:
+        job_model.status = JobStatus.TERMINATING
+        # TODO(0.19 or earlier): set JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = (
+            f"The job was inactive for {no_connections_secs} seconds,"
+            f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds"
+        )
+
+
+async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
+    policy = job.job_spec.utilization_policy
+    if policy is None:
+        return
+    after = common_utils.get_current_datetime() - timedelta(seconds=policy.time_window)
+    job_metrics = await get_job_metrics(session, job_model, after=after)
+    gpus_util_metrics: list[Metric] = []
+    for metric in job_metrics.metrics:
+        if metric.name.startswith("gpu_util_percent_gpu"):
+            gpus_util_metrics.append(metric)
+    if not gpus_util_metrics or gpus_util_metrics[0].timestamps[-1] > after + timedelta(minutes=1):
+        # Job has started recently, not enough points collected.
+        # Assuming that metrics collection interval less than 1 minute.
+        logger.debug("%s: GPU utilization check: not enough samples", fmt(job_model))
+        return
+    if _should_terminate_due_to_low_gpu_util(
+        policy.min_gpu_utilization, [m.values for m in gpus_util_metrics]
+    ):
+        logger.info("%s: GPU utilization check: terminating", fmt(job_model))
+        job_model.status = JobStatus.TERMINATING
+        # TODO(0.19 or earlier): set JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = (
+            f"The job GPU utilization below {policy.min_gpu_utilization}%"
+            f" for {policy.time_window} seconds"
+        )
+    else:
+        logger.debug("%s: GPU utilization check: OK", fmt(job_model))
+
+
+def _should_terminate_due_to_low_gpu_util(min_util: int, gpus_util: Iterable[Iterable[int]]):
+    for gpu_util in gpus_util:
+        if all(util < min_util for util in gpu_util):
+            return True
+    return False


 def _get_cluster_info(
     jobs: List[Job],
     replica_num: int,
     job_provisioning_data: JobProvisioningData,
+    job_runtime_data: Optional[JobRuntimeData],
 ) -> ClusterInfo:
     job_ips = []
     for job in jobs:
@@ -681,10 +731,13 @@ def _get_cluster_info(
             ).internal_ip
             or ""
         )
+    gpus_per_job = len(job_provisioning_data.instance_type.resources.gpus)
+    if job_runtime_data is not None and job_runtime_data.offer is not None:
+        gpus_per_job = len(job_runtime_data.offer.instance.resources.gpus)
     cluster_info = ClusterInfo(
         job_ips=job_ips,
         master_job_ip=job_ips[0],
-        gpus_per_job=len(job_provisioning_data.instance_type.resources.gpus),
+        gpus_per_job=gpus_per_job,
     )
     return cluster_info

@@ -763,16 +816,3 @@ def _submit_job_to_runner(
     # do not log here, because the runner will send a new status

     return True
-
-
-def _get_runner_timeout_interval(backend_type: BackendType, instance_type_name: str) -> timedelta:
-    # when changing timeouts, also consider process_instances._get_instance_timeout_interval
-    if backend_type == BackendType.LAMBDA:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.KUBERNETES:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)
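For clarity, the _should_terminate_due_to_low_gpu_util helper added above terminates a job only when at least one GPU stays below min_gpu_utilization for every sample in the time window; a GPU that crosses the threshold even once keeps the job alive. A quick illustration (values are made up):

# One GPU idle for the whole window, the other busy -> terminate.
assert _should_terminate_due_to_low_gpu_util(10, [[0, 5, 3], [95, 90, 99]]) is True

# The first GPU crossed the threshold at least once -> keep running.
assert _should_terminate_due_to_low_gpu_util(10, [[0, 50, 3], [95, 90, 99]]) is False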
dstack/_internal/server/background/tasks/process_runs.py

@@ -74,6 +74,7 @@ async def _process_next_run():
                 JobModel.run_id == run_model.id,
                 JobModel.id.not_in(job_lockset),
             )
+            .order_by(JobModel.id)  # take locks in order
            .with_for_update(skip_locked=True)
         )
         job_models = res.scalars().all()
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -35,6 +35,7 @@ from dstack._internal.core.models.runs import (
 )
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.core.services.profiles import get_termination
+from dstack._internal.server import settings
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -195,6 +196,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
                 InstanceModel.total_blocks > InstanceModel.busy_blocks,
             )
             .options(lazyload(InstanceModel.jobs))
+            .order_by(InstanceModel.id)  # take locks in order
            .with_for_update()
         )
         pool_instances = list(res.unique().scalars().all())
@@ -319,6 +321,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             select(VolumeModel)
             .where(VolumeModel.id.in_(volumes_ids))
             .options(selectinload(VolumeModel.user))
+            .order_by(VolumeModel.id)  # take locks in order
            .with_for_update()
         )
         async with get_locker().lock_ctx(VolumeModel.__tablename__, volumes_ids):
@@ -450,7 +453,7 @@ async def _run_job_on_new_instance(
     )
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
-    for backend, offer in offers[:15]:
+    for backend, offer in offers[: settings.MAX_OFFERS_TRIED]:
         logger.debug(
             "%s: trying %s in %s/%s for $%0.4f per hour",
             fmt(job_model),
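The hard-coded limit of 15 offers is now read from settings.MAX_OFFERS_TRIED (dstack/_internal/server/settings.py gains 6 lines in this release). The setting's definition is not shown in this diff; a sketch under the assumption that it keeps the previous default of 15 and follows an environment-variable override pattern (the variable name below is a guess):

import os

# Hypothetical definition: only the attribute name MAX_OFFERS_TRIED and the
# former hard-coded default of 15 are confirmed by this diff.
MAX_OFFERS_TRIED = int(os.getenv("DSTACK_SERVER_MAX_OFFERS_TRIED", 15))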
dstack/_internal/server/background/tasks/process_terminating_jobs.py

@@ -11,7 +11,6 @@ from dstack._internal.server.models import (
     JobModel,
     ProjectModel,
     VolumeAttachmentModel,
-    VolumeModel,
 )
 from dstack._internal.server.services.jobs import (
     process_terminating_job,
@@ -86,12 +85,7 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
         .where(InstanceModel.id == job_model.used_instance_id)
         .options(
             joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
-            joinedload(InstanceModel.volume_attachments)
-            .joinedload(VolumeAttachmentModel.volume)
-            .joinedload(VolumeModel.user),
-            joinedload(InstanceModel.volume_attachments)
-            .joinedload(VolumeAttachmentModel.volume)
-            .joinedload(VolumeModel.attachments),
+            joinedload(InstanceModel.volume_attachments).joinedload(VolumeAttachmentModel.volume),
         )
     )
     instance_model = res.unique().scalar()
dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py (new file)

@@ -0,0 +1,40 @@
+"""Add JobPrometheusMetrics
+
+Revision ID: 60e444118b6d
+Revises: a751ef183f27
+Create Date: 2025-02-21 10:59:26.339353
+
+"""
+
+import sqlalchemy as sa
+import sqlalchemy_utils
+from alembic import op
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "60e444118b6d"
+down_revision = "a751ef183f27"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "job_prometheus_metrics",
+        sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+        sa.Column("collected_at", dstack._internal.server.models.NaiveDateTime(), nullable=False),
+        sa.Column("text", sa.Text(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["job_id"], ["jobs.id"], name=op.f("fk_job_prometheus_metrics_job_id_jobs")
+        ),
+        sa.PrimaryKeyConstraint("job_id", name=op.f("pk_job_prometheus_metrics")),
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table("job_prometheus_metrics")
+    # ### end Alembic commands ###
dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py (new file)

@@ -0,0 +1,140 @@
+"""Add JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
+
+Revision ID: 98d1b92988bc
+Revises: 60e444118b6d
+Create Date: 2025-02-28 15:12:37.649876
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+from alembic_postgresql_enum import TableReference
+
+# revision identifiers, used by Alembic.
+revision = "98d1b92988bc"
+down_revision = "60e444118b6d"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # SQLite
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.alter_column(
+            "termination_reason",
+            existing_type=sa.VARCHAR(length=34),
+            type_=sa.Enum(
+                "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+                "INTERRUPTED_BY_NO_CAPACITY",
+                "WAITING_INSTANCE_LIMIT_EXCEEDED",
+                "WAITING_RUNNER_LIMIT_EXCEEDED",
+                "TERMINATED_BY_USER",
+                "VOLUME_ERROR",
+                "GATEWAY_ERROR",
+                "SCALED_DOWN",
+                "DONE_BY_RUNNER",
+                "ABORTED_BY_USER",
+                "TERMINATED_BY_SERVER",
+                "INACTIVITY_DURATION_EXCEEDED",
+                "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+                "CONTAINER_EXITED_WITH_ERROR",
+                "PORTS_BINDING_FAILED",
+                "CREATING_CONTAINER_ERROR",
+                "EXECUTOR_ERROR",
+                "MAX_DURATION_EXCEEDED",
+                name="jobterminationreason",
+            ),
+            existing_nullable=True,
+        )
+    # PostgreSQL
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+
+
+def downgrade() -> None:
+    # SQLite
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.alter_column(
+            "termination_reason",
+            existing_type=sa.Enum(
+                "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+                "INTERRUPTED_BY_NO_CAPACITY",
+                "WAITING_INSTANCE_LIMIT_EXCEEDED",
+                "WAITING_RUNNER_LIMIT_EXCEEDED",
+                "TERMINATED_BY_USER",
+                "VOLUME_ERROR",
+                "GATEWAY_ERROR",
+                "SCALED_DOWN",
+                "DONE_BY_RUNNER",
+                "ABORTED_BY_USER",
+                "TERMINATED_BY_SERVER",
+                "INACTIVITY_DURATION_EXCEEDED",
+                "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+                "CONTAINER_EXITED_WITH_ERROR",
+                "PORTS_BINDING_FAILED",
+                "CREATING_CONTAINER_ERROR",
+                "EXECUTOR_ERROR",
+                "MAX_DURATION_EXCEEDED",
+                name="jobterminationreason",
+            ),
+            type_=sa.VARCHAR(length=34),
+            existing_nullable=True,
+        )
+    # PostgreSQL
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
dstack/_internal/server/models.py

@@ -648,3 +648,14 @@ class JobMetricsPoint(BaseModel):
     # json-encoded lists of metric values of len(gpus) length
     gpus_memory_usage_bytes: Mapped[str] = mapped_column(Text)
     gpus_util_percent: Mapped[str] = mapped_column(Text)
+
+
+class JobPrometheusMetrics(BaseModel):
+    __tablename__ = "job_prometheus_metrics"
+
+    job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), primary_key=True)
+    job: Mapped["JobModel"] = relationship()
+
+    collected_at: Mapped[datetime] = mapped_column(NaiveDateTime)
+    # Raw Prometheus text response
+    text: Mapped[str] = mapped_column(Text)
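The table keeps exactly one row per job holding the latest raw scrape (see the upsert in process_prometheus_metrics.py above). A sketch of reading it back; the helper name and the metric names in the comment are hypothetical and this is not the actual routers/prometheus.py added in this release:

from typing import Optional
from uuid import UUID

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from dstack._internal.server.models import JobPrometheusMetrics


async def get_stored_scrape(session: AsyncSession, job_id: UUID) -> Optional[str]:
    # Returns the latest Prometheus exposition-format text for the job, e.g.
    #   # TYPE gpu_util_percent gauge
    #   gpu_util_percent{gpu="0"} 87
    # (metric names above are illustrative only)
    res = await session.execute(
        select(JobPrometheusMetrics.text).where(JobPrometheusMetrics.job_id == job_id)
    )
    return res.scalar_one_or_none()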
dstack/_internal/server/routers/logs.py

@@ -24,4 +24,7 @@ async def poll_logs(
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ) -> JobSubmissionLogs:
     _, project = user_project
+    # The runner guarantees logs have different timestamps if throughput < 1k logs / sec.
+    # Otherwise, some logs with duplicated timestamps may be filtered out.
+    # This limitation is imposed by cloud log services that support up to millisecond timestamp resolution.
     return await logs.poll_logs_async(project=project, request=body)