dstack 0.19.19__py3-none-any.whl → 0.19.21__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those versions.

Files changed (54)
  1. dstack/_internal/core/backends/__init__.py +0 -65
  2. dstack/_internal/core/backends/cloudrift/api_client.py +13 -1
  3. dstack/_internal/core/backends/features.py +64 -0
  4. dstack/_internal/core/backends/oci/resources.py +5 -5
  5. dstack/_internal/core/compatibility/fleets.py +2 -0
  6. dstack/_internal/core/compatibility/runs.py +4 -0
  7. dstack/_internal/core/models/profiles.py +37 -0
  8. dstack/_internal/server/app.py +22 -10
  9. dstack/_internal/server/background/__init__.py +5 -6
  10. dstack/_internal/server/background/tasks/process_fleets.py +52 -38
  11. dstack/_internal/server/background/tasks/process_gateways.py +2 -2
  12. dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
  13. dstack/_internal/server/background/tasks/process_instances.py +62 -48
  14. dstack/_internal/server/background/tasks/process_metrics.py +9 -2
  15. dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
  16. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
  17. dstack/_internal/server/background/tasks/process_running_jobs.py +129 -124
  18. dstack/_internal/server/background/tasks/process_runs.py +63 -20
  19. dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
  20. dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
  21. dstack/_internal/server/background/tasks/process_volumes.py +4 -1
  22. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  23. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  24. dstack/_internal/server/models.py +16 -16
  25. dstack/_internal/server/schemas/logs.py +1 -9
  26. dstack/_internal/server/services/fleets.py +19 -10
  27. dstack/_internal/server/services/gateways/__init__.py +17 -17
  28. dstack/_internal/server/services/instances.py +10 -14
  29. dstack/_internal/server/services/jobs/__init__.py +10 -12
  30. dstack/_internal/server/services/logs/aws.py +45 -3
  31. dstack/_internal/server/services/logs/filelog.py +121 -11
  32. dstack/_internal/server/services/offers.py +3 -3
  33. dstack/_internal/server/services/projects.py +35 -15
  34. dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
  35. dstack/_internal/server/services/prometheus/custom_metrics.py +22 -3
  36. dstack/_internal/server/services/runs.py +74 -34
  37. dstack/_internal/server/services/services/__init__.py +4 -1
  38. dstack/_internal/server/services/users.py +2 -3
  39. dstack/_internal/server/services/volumes.py +11 -11
  40. dstack/_internal/server/settings.py +3 -0
  41. dstack/_internal/server/statics/index.html +1 -1
  42. dstack/_internal/server/statics/{main-64f8273740c4b52c18f5.js → main-39a767528976f8078166.js} +7 -26
  43. dstack/_internal/server/statics/{main-64f8273740c4b52c18f5.js.map → main-39a767528976f8078166.js.map} +1 -1
  44. dstack/_internal/server/statics/{main-d58fc0460cb0eae7cb5c.css → main-8f9ee218d3eb45989682.css} +2 -2
  45. dstack/_internal/server/testing/common.py +7 -0
  46. dstack/_internal/server/utils/sentry_utils.py +12 -0
  47. dstack/_internal/utils/common.py +10 -21
  48. dstack/_internal/utils/cron.py +5 -0
  49. dstack/version.py +1 -1
  50. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/METADATA +2 -11
  51. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/RECORD +54 -49
  52. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/WHEEL +0 -0
  53. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/entry_points.txt +0 -0
  54. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/services/gateways/__init__.py

@@ -1,7 +1,7 @@
 import asyncio
 import datetime
 import uuid
-from datetime import timedelta, timezone
+from datetime import timedelta
 from functools import partial
 from typing import List, Optional, Sequence
 
@@ -11,16 +11,16 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import selectinload
 
 import dstack._internal.utils.random_names as random_names
-from dstack._internal.core.backends import (
-    BACKENDS_WITH_GATEWAY_SUPPORT,
-    BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT,
-)
 from dstack._internal.core.backends.base.compute import (
     Compute,
     ComputeWithGatewaySupport,
     get_dstack_gateway_wheel,
     get_dstack_runner_version,
 )
+from dstack._internal.core.backends.features import (
+    BACKENDS_WITH_GATEWAY_SUPPORT,
+    BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT,
+)
 from dstack._internal.core.errors import (
     GatewayError,
     ResourceNotExistsError,
@@ -86,15 +86,6 @@ async def get_gateway_by_name(
     return gateway_model_to_gateway(gateway)
 
 
-async def get_project_default_gateway(
-    session: AsyncSession, project: ProjectModel
-) -> Optional[Gateway]:
-    gateway: Optional[GatewayModel] = project.default_gateway
-    if gateway is None:
-        return None
-    return gateway_model_to_gateway(gateway)
-
-
 async def create_gateway_compute(
     project_name: str,
     backend_compute: Compute,
@@ -181,9 +172,9 @@ async def create_gateway(
     session.add(gateway)
     await session.commit()
 
-    if project.default_gateway is None or configuration.default:
+    default_gateway = await get_project_default_gateway_model(session=session, project=project)
+    if default_gateway is None or configuration.default:
         await set_default_gateway(session=session, project=project, name=configuration.name)
-
     return gateway_model_to_gateway(gateway)
 
 
@@ -349,6 +340,15 @@ async def get_project_gateway_model_by_name(
     return res.scalar()
 
 
+async def get_project_default_gateway_model(
+    session: AsyncSession, project: ProjectModel
+) -> Optional[GatewayModel]:
+    res = await session.execute(
+        select(GatewayModel).where(GatewayModel.id == project.default_gateway_id)
+    )
+    return res.scalar_one_or_none()
+
+
 async def generate_gateway_name(session: AsyncSession, project: ProjectModel) -> str:
     gateways = await list_project_gateway_models(session=session, project=project)
     names = {g.name for g in gateways}
@@ -557,7 +557,7 @@ def gateway_model_to_gateway(gateway_model: GatewayModel) -> Gateway:
         region=gateway_model.region,
         wildcard_domain=gateway_model.wildcard_domain,
         default=gateway_model.project.default_gateway_id == gateway_model.id,
-        created_at=gateway_model.created_at.replace(tzinfo=timezone.utc),
+        created_at=gateway_model.created_at,
         status=gateway_model.status,
         status_message=gateway_model.status_message,
         configuration=configuration,
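
A side note on the default-gateway change above: projects.py (later in this diff) stops eagerly loading ProjectModel.default_gateway, so the gateway service now fetches the default gateway with an explicit SELECT on default_gateway_id instead of reading the relationship attribute. A generic sketch of that query-by-id pattern; the function and parameter names are illustrative, not the package's:

    from typing import Optional
    from uuid import UUID

    from sqlalchemy import select
    from sqlalchemy.ext.asyncio import AsyncSession

    async def get_row_by_id(session: AsyncSession, model, row_id: Optional[UUID]):
        # Explicit query instead of touching a lazy-loaded relationship,
        # which an AsyncSession cannot resolve implicitly.
        if row_id is None:
            return None
        res = await session.execute(select(model).where(model.id == row_id))
        return res.scalar_one_or_none()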
dstack/_internal/server/services/instances.py

@@ -1,6 +1,6 @@
 import uuid
 from collections.abc import Container, Iterable
-from datetime import datetime, timezone
+from datetime import datetime
 from typing import Dict, List, Literal, Optional, Union
 
 import gpuhunt
@@ -8,11 +8,11 @@ from sqlalchemy import and_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload
 
-from dstack._internal.core.backends import BACKENDS_WITH_MULTINODE_SUPPORT
 from dstack._internal.core.backends.base.offers import (
     offer_to_catalog_item,
     requirements_to_query_filter,
 )
+from dstack._internal.core.backends.features import BACKENDS_WITH_MULTINODE_SUPPORT
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.envs import Env
 from dstack._internal.core.models.instances import (
@@ -34,7 +34,6 @@ from dstack._internal.core.models.profiles import (
     TerminationPolicy,
 )
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
-from dstack._internal.core.models.users import GlobalRole
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.core.services.profiles import get_termination
 from dstack._internal.server.models import (
@@ -44,7 +43,7 @@ from dstack._internal.server.models import (
     UserModel,
 )
 from dstack._internal.server.services.offers import generate_shared_offer
-from dstack._internal.server.services.projects import list_project_models, list_user_project_models
+from dstack._internal.server.services.projects import list_user_project_models
 from dstack._internal.utils import common as common_utils
 from dstack._internal.utils.logging import get_logger
 
@@ -62,7 +61,7 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
         status=instance_model.status,
         unreachable=instance_model.unreachable,
         termination_reason=instance_model.termination_reason,
-        created=instance_model.created_at.replace(tzinfo=timezone.utc),
+        created=instance_model.created_at,
         total_blocks=instance_model.total_blocks,
         busy_blocks=instance_model.busy_blocks,
     )
@@ -372,18 +371,15 @@ async def list_user_instances(
     limit: int,
     ascending: bool,
 ) -> List[Instance]:
-    if user.global_role == GlobalRole.ADMIN:
-        projects = await list_project_models(session=session)
-    else:
-        projects = await list_user_project_models(session=session, user=user)
-    if not projects:
-        return []
-
+    projects = await list_user_project_models(
+        session=session,
+        user=user,
+        only_names=True,
+    )
     if project_names is not None:
-        projects = [proj for proj in projects if proj.name in project_names]
+        projects = [p for p in projects if p.name in project_names]
     if len(projects) == 0:
         return []
-
     instance_models = await list_projects_instance_models(
         session=session,
         projects=projects,
dstack/_internal/server/services/jobs/__init__.py

@@ -1,13 +1,13 @@
 import itertools
 import json
-from datetime import timedelta, timezone
+from datetime import timedelta
 from typing import Dict, Iterable, List, Optional, Tuple
 from uuid import UUID
 
 import requests
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload
+from sqlalchemy.orm import joinedload, load_only
 
 import dstack._internal.server.services.backends as backends_services
 from dstack._internal.core.backends.base.backend import Backend
@@ -130,7 +130,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
     ):
         backend_data = json.loads(job_provisioning_data.backend_data)
         job_provisioning_data.backend = backend_data["base_backend"]
-    last_processed_at = job_model.last_processed_at.replace(tzinfo=timezone.utc)
+    last_processed_at = job_model.last_processed_at
     finished_at = None
     if job_model.status.is_finished():
         finished_at = last_processed_at
@@ -140,7 +140,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
         id=job_model.id,
         submission_num=job_model.submission_num,
         deployment_num=job_model.deployment_num,
-        submitted_at=job_model.submitted_at.replace(tzinfo=timezone.utc),
+        submitted_at=job_model.submitted_at,
         last_processed_at=last_processed_at,
         finished_at=finished_at,
         inactivity_secs=job_model.inactivity_secs,
@@ -231,10 +231,7 @@ async def process_terminating_job(
     Graceful stop should already be done by `process_terminating_run`.
     Caller must acquire the locks on the job and the job's instance.
     """
-    if (
-        job_model.remove_at is not None
-        and job_model.remove_at.replace(tzinfo=timezone.utc) > common.get_current_datetime()
-    ):
+    if job_model.remove_at is not None and job_model.remove_at > common.get_current_datetime():
         # it's too early to terminate the instance
         return
 
@@ -550,24 +547,25 @@ def _should_force_detach_volume(job_model: JobModel, stop_duration: Optional[int
     return (
         job_model.volumes_detached_at is not None
         and common.get_current_datetime()
-        > job_model.volumes_detached_at.replace(tzinfo=timezone.utc) + MIN_FORCE_DETACH_WAIT_PERIOD
+        > job_model.volumes_detached_at + MIN_FORCE_DETACH_WAIT_PERIOD
        and (
             job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER
             or stop_duration is not None
             and common.get_current_datetime()
-            > job_model.volumes_detached_at.replace(tzinfo=timezone.utc)
-            + timedelta(seconds=stop_duration)
+            > job_model.volumes_detached_at + timedelta(seconds=stop_duration)
         )
     )
 
 
 async def get_instances_ids_with_detaching_volumes(session: AsyncSession) -> List[UUID]:
     res = await session.execute(
-        select(JobModel).where(
+        select(JobModel)
+        .where(
             JobModel.status == JobStatus.TERMINATING,
             JobModel.used_instance_id.is_not(None),
             JobModel.volumes_detached_at.is_not(None),
         )
+        .options(load_only(JobModel.used_instance_id))
     )
     job_models = res.scalars().all()
     return [jm.used_instance_id for jm in job_models if jm.used_instance_id]
dstack/_internal/server/services/logs/aws.py

@@ -55,6 +55,8 @@ class CloudWatchLogStorage(LogStorage):
     PAST_EVENT_MAX_DELTA = int((timedelta(days=14)).total_seconds()) * 1000 - CLOCK_DRIFT
     # "None of the log events in the batch can be more than 2 hours in the future."
     FUTURE_EVENT_MAX_DELTA = int((timedelta(hours=2)).total_seconds()) * 1000 - CLOCK_DRIFT
+    # Maximum number of retries when polling for log events to skip empty pages.
+    MAX_RETRIES = 10
 
     def __init__(self, *, group: str, region: Optional[str] = None) -> None:
         with self._wrap_boto_errors():
@@ -80,7 +82,7 @@
         next_token: Optional[str] = None
         with self._wrap_boto_errors():
             try:
-                cw_events, next_token = self._get_log_events(stream, request)
+                cw_events, next_token = self._get_log_events_with_retry(stream, request)
             except botocore.exceptions.ClientError as e:
                 if not self._is_resource_not_found_exception(e):
                     raise
@@ -101,7 +103,47 @@
             )
             for cw_event in cw_events
         ]
-        return JobSubmissionLogs(logs=logs, next_token=next_token if len(logs) > 0 else None)
+        return JobSubmissionLogs(logs=logs, next_token=next_token)
+
+    def _get_log_events_with_retry(
+        self, stream: str, request: PollLogsRequest
+    ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
+        current_request = request
+        previous_next_token = request.next_token
+
+        for attempt in range(self.MAX_RETRIES):
+            cw_events, next_token = self._get_log_events(stream, current_request)
+
+            if cw_events:
+                return cw_events, next_token
+
+            if not next_token or next_token == previous_next_token:
+                return [], None
+
+            previous_next_token = next_token
+            current_request = PollLogsRequest(
+                run_name=request.run_name,
+                job_submission_id=request.job_submission_id,
+                start_time=request.start_time,
+                end_time=request.end_time,
+                descending=request.descending,
+                next_token=next_token,
+                limit=request.limit,
+                diagnose=request.diagnose,
+            )
+
+        if not request.descending:
+            logger.debug(
+                "Stream %s: exhausted %d retries without finding logs, returning empty response",
+                stream,
+                self.MAX_RETRIES,
+            )
+        # Only return the next token after exhausting retries if going descending—
+        # AWS CloudWatch guarantees more logs in that case. In ascending mode,
+        # next token is always returned, even if no logs remain.
+        # So descending works reliably; ascending has limits if gaps are too large.
+        # In the future, UI/CLI should handle retries, and we can return next token for ascending too.
+        return [], next_token if request.descending else None
 
     def _get_log_events(
         self, stream: str, request: PollLogsRequest
@@ -115,7 +157,7 @@
         }
 
         if request.start_time:
-            parameters["startTime"] = datetime_to_unix_time_ms(request.start_time) + 1
+            parameters["startTime"] = datetime_to_unix_time_ms(request.start_time)
 
         if request.end_time:
             parameters["endTime"] = datetime_to_unix_time_ms(request.end_time)
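
Note on the log-polling change above: CloudWatch can return an empty page together with a next token, so the server now retries up to MAX_RETRIES pages before giving up. For context only, a hedged sketch of how a caller might drain poll_logs pages via the returned next_token; storage, project, and request stand for a CloudWatchLogStorage, a project model, and a PollLogsRequest as in the diff, and the pydantic-style copy(update=...) call is an assumption rather than code from the package.

    def collect_all_logs(storage, project, request):
        # Accumulate pages until the server stops returning a next_token.
        events = []
        while True:
            page = storage.poll_logs(project, request)  # JobSubmissionLogs(logs=..., next_token=...)
            events.extend(page.logs)
            if not page.next_token:
                break
            # Re-issue the request pointing at the next page.
            request = request.copy(update={"next_token": page.next_token})
        return events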
dstack/_internal/server/services/logs/filelog.py

@@ -1,5 +1,6 @@
+import os
 from pathlib import Path
-from typing import List, Union
+from typing import Generator, List, Optional, Tuple, Union
 from uuid import UUID
 
 from dstack._internal.core.errors import ServerClientError
@@ -37,18 +38,17 @@ class FileLogStorage(LogStorage):
             producer=log_producer,
         )
 
+        if request.descending:
+            return self._poll_logs_descending(log_file_path, request)
+        else:
+            return self._poll_logs_ascending(log_file_path, request)
+
+    def _poll_logs_ascending(
+        self, log_file_path: Path, request: PollLogsRequest
+    ) -> JobSubmissionLogs:
         start_line = 0
         if request.next_token:
-            try:
-                start_line = int(request.next_token)
-                if start_line < 0:
-                    raise ServerClientError(
-                        f"Invalid next_token: {request.next_token}. Must be a non-negative integer."
-                    )
-            except ValueError:
-                raise ServerClientError(
-                    f"Invalid next_token: {request.next_token}. Must be a valid integer."
-                )
+            start_line = self._next_token(request)
 
         logs = []
         next_token = None
@@ -94,6 +94,102 @@
 
         return JobSubmissionLogs(logs=logs, next_token=next_token)
 
+    def _poll_logs_descending(
+        self, log_file_path: Path, request: PollLogsRequest
+    ) -> JobSubmissionLogs:
+        start_offset = self._next_token(request)
+
+        candidate_logs = []
+
+        try:
+            line_generator = self._read_lines_reversed(log_file_path, start_offset)
+
+            for line_bytes, line_start_offset in line_generator:
+                try:
+                    line_str = line_bytes.decode("utf-8")
+                    log_event = LogEvent.__response__.parse_raw(line_str)
+                except Exception:
+                    continue  # Skip malformed lines
+
+                if request.end_time is not None and log_event.timestamp > request.end_time:
+                    continue
+                if request.start_time and log_event.timestamp <= request.start_time:
+                    break
+
+                candidate_logs.append((log_event, line_start_offset))
+
+                if len(candidate_logs) > request.limit:
+                    break
+        except FileNotFoundError:
+            return JobSubmissionLogs(logs=[], next_token=None)
+
+        logs = [log for log, offset in candidate_logs[: request.limit]]
+        next_token = None
+        if len(candidate_logs) > request.limit:
+            # We fetched one more than the limit, so there are more pages.
+            # The next token should point to the start of the last log we are returning.
+            _last_log_event, last_log_offset = candidate_logs[request.limit - 1]
+            next_token = str(last_log_offset)
+
+        return JobSubmissionLogs(logs=logs, next_token=next_token)
+
+    @staticmethod
+    def _read_lines_reversed(
+        filepath: Path, start_offset: Optional[int] = None, chunk_size: int = 8192
+    ) -> Generator[Tuple[bytes, int], None, None]:
+        """
+        A generator that yields lines from a file in reverse order, along with the byte
+        offset of the start of each line. This is memory-efficient for large files.
+        """
+        with open(filepath, "rb") as f:
+            f.seek(0, os.SEEK_END)
+            file_size = f.tell()
+            cursor = file_size
+
+            # If a start_offset is provided, optimize by starting the read
+            # from a more specific location instead of the end of the file.
+            if start_offset is not None and start_offset < file_size:
+                # To get the full content of the line that straddles the offset,
+                # we need to find its end (the next newline character).
+                f.seek(start_offset)
+                chunk = f.read(chunk_size)
+                newline_pos = chunk.find(b"\n")
+                if newline_pos != -1:
+                    # Found the end of the line. The cursor for reverse reading
+                    # should start from this point to include the full line.
+                    cursor = start_offset + newline_pos + 1
+                else:
+                    # No newline found, which means the rest of the file is one line.
+                    # The default cursor pointing to file_size is correct.
+                    pass
+
+            buffer = b""
+
+            while cursor > 0:
+                seek_pos = max(0, cursor - chunk_size)
+                amount_to_read = cursor - seek_pos
+                f.seek(seek_pos)
+                chunk = f.read(amount_to_read)
+                cursor = seek_pos
+
+                buffer = chunk + buffer
+
+                while b"\n" in buffer:
+                    newline_pos = buffer.rfind(b"\n")
+                    line = buffer[newline_pos + 1 :]
+                    line_start_offset = cursor + newline_pos + 1
+
+                    # Skip lines that start at or after the start_offset
+                    if start_offset is None or line_start_offset < start_offset:
+                        yield line, line_start_offset
+
+                    buffer = buffer[:newline_pos]
+
+            # The remaining buffer is the first line of the file.
+            # Only yield it if we're not using start_offset or if it starts before start_offset
+            if buffer and (start_offset is None or 0 < start_offset):
+                yield buffer, 0
+
     def write_logs(
         self,
         project: ProjectModel,
@@ -148,3 +244,17 @@ class FileLogStorage(LogStorage):
             log_source=LogEventSource.STDOUT,
             message=runner_log_event.message.decode(errors="replace"),
         )
+
+    def _next_token(self, request: PollLogsRequest) -> Optional[int]:
+        next_token = request.next_token
+        if next_token is None:
+            return None
+        try:
+            value = int(next_token)
+            if value < 0:
+                raise ValueError("Offset must be non-negative")
+            return value
+        except (ValueError, TypeError):
+            raise ServerClientError(
+                f"Invalid next_token: {next_token}. Must be a non-negative integer."
+            )
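
The descending path in filelog.py treats next_token as a byte offset and walks the log file backwards in chunks. A simplified, self-contained sketch of that reverse-reading idea (without the start_offset handling); the file path is arbitrary and nothing here is dstack API.

    import os
    from itertools import islice
    from pathlib import Path
    from typing import Generator, Tuple

    def read_lines_reversed(path: Path, chunk_size: int = 8192) -> Generator[Tuple[bytes, int], None, None]:
        # Yield (line, start_offset) pairs from the last line to the first.
        # A trailing newline in the file yields one empty line first.
        with open(path, "rb") as f:
            f.seek(0, os.SEEK_END)
            cursor = f.tell()
            buffer = b""
            while cursor > 0:
                seek_pos = max(0, cursor - chunk_size)
                f.seek(seek_pos)
                buffer = f.read(cursor - seek_pos) + buffer
                cursor = seek_pos
                while b"\n" in buffer:
                    newline_pos = buffer.rfind(b"\n")
                    yield buffer[newline_pos + 1:], cursor + newline_pos + 1
                    buffer = buffer[:newline_pos]
            if buffer:
                yield buffer, 0

    # Tail-like usage: the last 5 lines of a hypothetical log file, newest first.
    for line, offset in islice(read_lines_reversed(Path("/tmp/example.log")), 5):
        print(offset, line.decode(errors="replace"))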
dstack/_internal/server/services/offers.py

@@ -2,13 +2,13 @@ from typing import List, Literal, Optional, Tuple, Union
 
 import gpuhunt
 
-from dstack._internal.core.backends import (
+from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
+from dstack._internal.core.backends.features import (
     BACKENDS_WITH_CREATE_INSTANCE_SUPPORT,
     BACKENDS_WITH_MULTINODE_SUPPORT,
     BACKENDS_WITH_RESERVATION_SUPPORT,
 )
-from dstack._internal.core.backends.base.backend import Backend
-from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
dstack/_internal/server/services/projects.py

@@ -1,11 +1,10 @@
 import uuid
-from datetime import timezone
 from typing import Awaitable, Callable, List, Optional, Tuple
 
 from sqlalchemy import delete, select, update
 from sqlalchemy import func as safunc
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload
+from sqlalchemy.orm import QueryableAttribute, joinedload, load_only
 
 from dstack._internal.core.backends.configurators import get_configurator
 from dstack._internal.core.backends.dstack.models import (
@@ -54,13 +53,12 @@ async def list_user_projects(
     user: UserModel,
 ) -> List[Project]:
     """
-    Returns projects where the user is a member.
+    Returns projects where the user is a member or all projects for global admins.
     """
-    if user.global_role == GlobalRole.ADMIN:
-        projects = await list_project_models(session=session)
-    else:
-        projects = await list_user_project_models(session=session, user=user)
-
+    projects = await list_user_project_models(
+        session=session,
+        user=user,
+    )
     projects = sorted(projects, key=lambda p: p.created_at)
     return [
         project_model_to_project(p, include_backends=False, include_members=False)
@@ -80,7 +78,7 @@ async def list_user_accessible_projects(
     if user.global_role == GlobalRole.ADMIN:
         projects = await list_project_models(session=session)
     else:
-        member_projects = await list_user_project_models(session=session, user=user)
+        member_projects = await list_member_project_models(session=session, user=user)
         public_projects = await list_public_non_member_project_models(session=session, user=user)
         projects = member_projects + public_projects
 
@@ -167,7 +165,7 @@ async def delete_projects(
     projects_names: List[str],
 ):
     if user.global_role != GlobalRole.ADMIN:
-        user_projects = await list_user_project_models(
+        user_projects = await list_member_project_models(
             session=session, user=user, include_members=True
         )
         user_project_names = [p.name for p in user_projects]
@@ -339,9 +337,25 @@ async def clear_project_members(
 
 
 async def list_user_project_models(
+    session: AsyncSession,
+    user: UserModel,
+    only_names: bool = False,
+) -> List[ProjectModel]:
+    load_only_attrs = []
+    if only_names:
+        load_only_attrs += [ProjectModel.id, ProjectModel.name]
+    if user.global_role == GlobalRole.ADMIN:
+        return await list_project_models(session=session, load_only_attrs=load_only_attrs)
+    return await list_member_project_models(
+        session=session, user=user, load_only_attrs=load_only_attrs
+    )
+
+
+async def list_member_project_models(
     session: AsyncSession,
     user: UserModel,
     include_members: bool = False,
+    load_only_attrs: Optional[List[QueryableAttribute]] = None,
 ) -> List[ProjectModel]:
     """
     List project models for a user where they are a member.
@@ -349,6 +363,8 @@ async def list_user_project_models(
     options = []
     if include_members:
         options.append(joinedload(ProjectModel.members))
+    if load_only_attrs:
+        options.append(load_only(*load_only_attrs))
     res = await session.execute(
         select(ProjectModel)
         .where(
@@ -395,13 +411,20 @@ async def list_user_owned_project_models(
 
 async def list_project_models(
     session: AsyncSession,
+    load_only_attrs: Optional[List[QueryableAttribute]] = None,
 ) -> List[ProjectModel]:
+    options = []
+    if load_only_attrs:
+        options.append(load_only(*load_only_attrs))
     res = await session.execute(
-        select(ProjectModel).where(ProjectModel.deleted == False),
+        select(ProjectModel).where(ProjectModel.deleted == False).options(*options)
    )
     return list(res.scalars().all())
 
 
+# TODO: Do not load ProjectModel.backends and ProjectModel.members by default when getting project
+
+
 async def get_project_model_by_name(
     session: AsyncSession, project_name: str, ignore_case: bool = True
 ) -> Optional[ProjectModel]:
@@ -415,7 +438,6 @@ async def get_project_model_by_name(
         .where(*filters)
         .options(joinedload(ProjectModel.backends))
         .options(joinedload(ProjectModel.members))
-        .options(joinedload(ProjectModel.default_gateway))
     )
     return res.unique().scalar()
 
@@ -432,7 +454,6 @@ async def get_project_model_by_name_or_error(
         )
         .options(joinedload(ProjectModel.backends))
         .options(joinedload(ProjectModel.members))
-        .options(joinedload(ProjectModel.default_gateway))
     )
     return res.unique().scalar_one()
 
@@ -449,7 +470,6 @@ async def get_project_model_by_id_or_error(
         )
         .options(joinedload(ProjectModel.backends))
         .options(joinedload(ProjectModel.members))
-        .options(joinedload(ProjectModel.default_gateway))
     )
     return res.unique().scalar_one()
 
@@ -537,7 +557,7 @@ def project_model_to_project(
         project_id=project_model.id,
         project_name=project_model.name,
         owner=users.user_model_to_user(project_model.owner),
-        created_at=project_model.created_at.replace(tzinfo=timezone.utc),
+        created_at=project_model.created_at,
         backends=backends,
         members=members,
         is_public=project_model.is_public,
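
The new load_only_attrs parameter in projects.py is a thin wrapper over SQLAlchemy's load_only option, used so callers such as list_user_instances can fetch just project ids and names. A minimal standalone illustration of load_only with an invented model (not dstack's ProjectModel):

    from sqlalchemy import select
    from sqlalchemy.orm import DeclarativeBase, Mapped, load_only, mapped_column

    class Base(DeclarativeBase):
        pass

    class Project(Base):
        __tablename__ = "projects"
        id: Mapped[int] = mapped_column(primary_key=True)
        name: Mapped[str] = mapped_column()
        description: Mapped[str] = mapped_column()

    # Only id and name are emitted in the SELECT; other columns are deferred
    # and would trigger a lazy load if accessed later.
    stmt = select(Project).options(load_only(Project.id, Project.name))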
dstack/_internal/server/services/prometheus/client_metrics.py

@@ -5,6 +5,9 @@ class RunMetrics:
     """Wrapper class for run-related Prometheus metrics."""
 
     def __init__(self):
+        # submit_to_provision_duration reflects real provisioning time
+        # but does not reflect how quickly provisioning processing works
+        # since it includes scheduling time, retrying, etc.
         self._submit_to_provision_duration = Histogram(
             "dstack_submit_to_provision_duration_seconds",
             "Time from when a run has been submitted and first job provisioning",