dstack 0.19.15rc1__py3-none-any.whl → 0.19.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (93) hide show
  1. dstack/_internal/cli/commands/secrets.py +92 -0
  2. dstack/_internal/cli/main.py +2 -0
  3. dstack/_internal/cli/services/completion.py +5 -0
  4. dstack/_internal/cli/services/configurators/run.py +59 -17
  5. dstack/_internal/cli/utils/secrets.py +25 -0
  6. dstack/_internal/core/backends/__init__.py +10 -4
  7. dstack/_internal/core/backends/cloudrift/__init__.py +0 -0
  8. dstack/_internal/core/backends/cloudrift/api_client.py +208 -0
  9. dstack/_internal/core/backends/cloudrift/backend.py +16 -0
  10. dstack/_internal/core/backends/cloudrift/compute.py +138 -0
  11. dstack/_internal/core/backends/cloudrift/configurator.py +66 -0
  12. dstack/_internal/core/backends/cloudrift/models.py +40 -0
  13. dstack/_internal/core/backends/configurators.py +9 -0
  14. dstack/_internal/core/backends/models.py +7 -0
  15. dstack/_internal/core/compatibility/logs.py +15 -0
  16. dstack/_internal/core/compatibility/runs.py +31 -2
  17. dstack/_internal/core/models/backends/base.py +2 -0
  18. dstack/_internal/core/models/configurations.py +33 -2
  19. dstack/_internal/core/models/files.py +67 -0
  20. dstack/_internal/core/models/logs.py +2 -1
  21. dstack/_internal/core/models/runs.py +24 -1
  22. dstack/_internal/core/models/secrets.py +9 -2
  23. dstack/_internal/server/app.py +2 -0
  24. dstack/_internal/server/background/tasks/process_fleets.py +1 -1
  25. dstack/_internal/server/background/tasks/process_gateways.py +1 -1
  26. dstack/_internal/server/background/tasks/process_instances.py +1 -1
  27. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
  28. dstack/_internal/server/background/tasks/process_running_jobs.py +110 -13
  29. dstack/_internal/server/background/tasks/process_runs.py +36 -5
  30. dstack/_internal/server/background/tasks/process_submitted_jobs.py +10 -4
  31. dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
  32. dstack/_internal/server/background/tasks/process_volumes.py +1 -1
  33. dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
  34. dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
  35. dstack/_internal/server/models.py +33 -0
  36. dstack/_internal/server/routers/files.py +67 -0
  37. dstack/_internal/server/routers/gateways.py +6 -3
  38. dstack/_internal/server/routers/projects.py +63 -0
  39. dstack/_internal/server/routers/prometheus.py +5 -5
  40. dstack/_internal/server/routers/secrets.py +57 -15
  41. dstack/_internal/server/schemas/files.py +5 -0
  42. dstack/_internal/server/schemas/logs.py +10 -1
  43. dstack/_internal/server/schemas/projects.py +12 -0
  44. dstack/_internal/server/schemas/runner.py +2 -0
  45. dstack/_internal/server/schemas/secrets.py +7 -11
  46. dstack/_internal/server/security/permissions.py +75 -2
  47. dstack/_internal/server/services/backends/__init__.py +1 -1
  48. dstack/_internal/server/services/files.py +91 -0
  49. dstack/_internal/server/services/fleets.py +1 -1
  50. dstack/_internal/server/services/gateways/__init__.py +1 -1
  51. dstack/_internal/server/services/jobs/__init__.py +19 -8
  52. dstack/_internal/server/services/jobs/configurators/base.py +27 -3
  53. dstack/_internal/server/services/jobs/configurators/dev.py +3 -3
  54. dstack/_internal/server/services/logs/aws.py +38 -38
  55. dstack/_internal/server/services/logs/filelog.py +48 -14
  56. dstack/_internal/server/services/logs/gcp.py +17 -16
  57. dstack/_internal/server/services/projects.py +164 -5
  58. dstack/_internal/server/services/prometheus/__init__.py +0 -0
  59. dstack/_internal/server/services/prometheus/client_metrics.py +52 -0
  60. dstack/_internal/server/services/proxy/repo.py +3 -0
  61. dstack/_internal/server/services/runner/client.py +8 -0
  62. dstack/_internal/server/services/runs.py +55 -10
  63. dstack/_internal/server/services/secrets.py +204 -0
  64. dstack/_internal/server/services/services/__init__.py +2 -1
  65. dstack/_internal/server/services/storage/base.py +21 -0
  66. dstack/_internal/server/services/storage/gcs.py +28 -6
  67. dstack/_internal/server/services/storage/s3.py +27 -9
  68. dstack/_internal/server/services/users.py +1 -3
  69. dstack/_internal/server/services/volumes.py +1 -1
  70. dstack/_internal/server/settings.py +2 -2
  71. dstack/_internal/server/statics/index.html +1 -1
  72. dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js → main-d151637af20f70b2e796.js} +104 -48
  73. dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js.map → main-d151637af20f70b2e796.js.map} +1 -1
  74. dstack/_internal/server/statics/{main-f39c418b05fe14772dd8.css → main-d48635d8fe670d53961c.css} +1 -1
  75. dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
  76. dstack/_internal/server/testing/common.py +43 -5
  77. dstack/_internal/settings.py +5 -0
  78. dstack/_internal/utils/files.py +69 -0
  79. dstack/_internal/utils/nested_list.py +47 -0
  80. dstack/_internal/utils/path.py +12 -4
  81. dstack/api/_public/runs.py +73 -12
  82. dstack/api/server/__init__.py +6 -0
  83. dstack/api/server/_files.py +18 -0
  84. dstack/api/server/_logs.py +5 -1
  85. dstack/api/server/_projects.py +24 -0
  86. dstack/api/server/_secrets.py +15 -15
  87. dstack/version.py +1 -1
  88. {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/METADATA +3 -4
  89. {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/RECORD +93 -71
  90. /dstack/_internal/server/services/{prometheus.py → prometheus/custom_metrics.py} +0 -0
  91. {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/WHEEL +0 -0
  92. {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/entry_points.txt +0 -0
  93. {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/licenses/LICENSE.md +0 -0
@@ -78,14 +78,22 @@ class CloudWatchLogStorage(LogStorage):
78
78
  project.name, request.run_name, request.job_submission_id, log_producer
79
79
  )
80
80
  cw_events: List[_CloudWatchLogEvent]
81
+ next_token: Optional[str] = None
81
82
  with self._wrap_boto_errors():
82
83
  try:
83
- cw_events = self._get_log_events(stream, request)
84
+ cw_events, next_token = self._get_log_events(stream, request)
84
85
  except botocore.exceptions.ClientError as e:
85
86
  if not self._is_resource_not_found_exception(e):
86
87
  raise
87
- logger.debug("Stream %s not found, returning dummy response", stream)
88
- cw_events = []
88
+ # Check if the group exists to distinguish between group not found vs stream not found
89
+ try:
90
+ self._check_group_exists(self._group)
91
+ # Group exists, so the error must be due to missing stream
92
+ logger.debug("Stream %s not found, returning dummy response", stream)
93
+ cw_events = []
94
+ except LogStorageError:
95
+ # Group doesn't exist, re-raise the LogStorageError
96
+ raise
89
97
  logs = [
90
98
  LogEvent(
91
99
  timestamp=unix_time_ms_to_datetime(cw_event["timestamp"]),
@@ -94,51 +102,43 @@ class CloudWatchLogStorage(LogStorage):
94
102
  )
95
103
  for cw_event in cw_events
96
104
  ]
97
- return JobSubmissionLogs(logs=logs)
105
+ return JobSubmissionLogs(logs=logs, next_token=next_token if len(logs) > 0 else None)
98
106
 
99
- def _get_log_events(self, stream: str, request: PollLogsRequest) -> List[_CloudWatchLogEvent]:
100
- limit = request.limit
107
+ def _get_log_events(
108
+ self, stream: str, request: PollLogsRequest
109
+ ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
110
+ start_from_head = not request.descending
101
111
  parameters = {
102
112
  "logGroupName": self._group,
103
113
  "logStreamName": stream,
104
- "limit": limit,
114
+ "limit": request.limit,
115
+ "startFromHead": start_from_head,
105
116
  }
106
- start_from_head = not request.descending
107
- parameters["startFromHead"] = start_from_head
117
+
108
118
  if request.start_time:
109
- # XXX: Since callers use start_time/end_time for pagination, one millisecond is added
110
- # to avoid an infinite loop because startTime boundary is inclusive.
111
119
  parameters["startTime"] = datetime_to_unix_time_ms(request.start_time) + 1
120
+
112
121
  if request.end_time:
113
- # No need to subtract one millisecond in this case, though, seems that endTime is
114
- # exclusive, that is, time interval boundaries are [startTime, endTime)
115
122
  parameters["endTime"] = datetime_to_unix_time_ms(request.end_time)
116
- # "Partially full or empty pages don't necessarily mean that pagination is finished.
117
- # As long as the nextBackwardToken or nextForwardToken returned is NOT equal to the
118
- # nextToken that you passed into the API call, there might be more log events available."
119
- events: List[_CloudWatchLogEvent] = []
120
- next_token: Optional[str] = None
123
+ elif start_from_head:
124
+ # When startFromHead=true and no endTime is provided, set endTime to "now"
125
+ # to prevent infinite pagination as new logs arrive faster than we can read them
126
+ parameters["endTime"] = datetime_to_unix_time_ms(datetime.now(timezone.utc))
127
+
128
+ if request.next_token:
129
+ parameters["nextToken"] = request.next_token
130
+
131
+ response = self._client.get_log_events(**parameters)
132
+
133
+ events = response.get("events", [])
121
134
  next_token_key = "nextForwardToken" if start_from_head else "nextBackwardToken"
122
- # Limit max tries to avoid a possible infinite loop if the API is misbehaving
123
- tries_left = 10
124
- while tries_left:
125
- if next_token is not None:
126
- parameters["nextToken"] = next_token
127
- response = self._client.get_log_events(**parameters)
128
- if start_from_head:
129
- events.extend(response["events"])
130
- else:
131
- # Regardless of the startFromHead value log events are arranged in
132
- # chronological order, from earliest to latest.
133
- events.extend(reversed(response["events"]))
134
- if len(events) >= limit:
135
- return events[:limit]
136
- if response[next_token_key] == next_token:
137
- return events
138
- next_token = response[next_token_key]
139
- tries_left -= 1
140
- logger.warning("too many requests to stream %s, returning partial response", stream)
141
- return events
135
+ next_token = response.get(next_token_key)
136
+
137
+ # TODO: The code below is not going to be used until we migrate from base64-encoded logs to plain text logs.
138
+ if request.descending:
139
+ events = list(reversed(events))
140
+
141
+ return events, next_token
142
142
 
143
143
  def write_logs(
144
144
  self,
@@ -14,6 +14,7 @@ from dstack._internal.server.schemas.logs import PollLogsRequest
14
14
  from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
15
15
  from dstack._internal.server.services.logs.base import (
16
16
  LogStorage,
17
+ LogStorageError,
17
18
  b64encode_raw_message,
18
19
  unix_time_ms_to_datetime,
19
20
  )
@@ -29,7 +30,9 @@ class FileLogStorage(LogStorage):
29
30
  self.root = Path(root)
30
31
 
31
32
  def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
32
- # TODO Respect request.limit to support pagination
33
+ if request.descending:
34
+ raise LogStorageError("descending: true is not supported")
35
+
33
36
  log_producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
34
37
  log_file_path = self._get_log_file_path(
35
38
  project_name=project.name,
@@ -37,22 +40,53 @@ class FileLogStorage(LogStorage):
37
40
  job_submission_id=request.job_submission_id,
38
41
  producer=log_producer,
39
42
  )
43
+
44
+ start_line = 0
45
+ if request.next_token:
46
+ try:
47
+ start_line = int(request.next_token)
48
+ if start_line < 0:
49
+ raise LogStorageError(
50
+ f"Invalid next_token: {request.next_token}. Must be a non-negative integer."
51
+ )
52
+ except ValueError:
53
+ raise LogStorageError(
54
+ f"Invalid next_token: {request.next_token}. Must be a valid integer."
55
+ )
56
+
40
57
  logs = []
58
+ next_token = None
59
+ current_line = 0
60
+
41
61
  try:
42
62
  with open(log_file_path) as f:
43
- for line in f:
44
- log_event = LogEvent.__response__.parse_raw(line)
45
- if request.start_time and log_event.timestamp <= request.start_time:
46
- continue
47
- if request.end_time is None or log_event.timestamp < request.end_time:
48
- logs.append(log_event)
49
- else:
50
- break
51
- except IOError:
52
- pass
53
- if request.descending:
54
- logs = list(reversed(logs))
55
- return JobSubmissionLogs(logs=logs)
63
+ lines = f.readlines()
64
+
65
+ for i, line in enumerate(lines):
66
+ if current_line < start_line:
67
+ current_line += 1
68
+ continue
69
+
70
+ log_event = LogEvent.__response__.parse_raw(line)
71
+ current_line += 1
72
+
73
+ if request.start_time and log_event.timestamp <= request.start_time:
74
+ continue
75
+ if request.end_time is not None and log_event.timestamp >= request.end_time:
76
+ break
77
+
78
+ logs.append(log_event)
79
+
80
+ if len(logs) >= request.limit:
81
+ # Only set next_token if there are more lines to read
82
+ if current_line < len(lines):
83
+ next_token = str(current_line)
84
+ break
85
+
86
+ except IOError as e:
87
+ raise LogStorageError(f"Failed to read log file {log_file_path}: {e}")
88
+
89
+ return JobSubmissionLogs(logs=logs, next_token=next_token)
56
90
 
57
91
  def write_logs(
58
92
  self,
@@ -1,5 +1,4 @@
1
- import time
2
- from typing import Iterable, List
1
+ from typing import List
3
2
  from uuid import UUID
4
3
 
5
4
  from dstack._internal.core.errors import ServerClientError
@@ -25,7 +24,8 @@ GCP_LOGGING_AVAILABLE = True
25
24
  try:
26
25
  import google.api_core.exceptions
27
26
  import google.auth.exceptions
28
- from google.cloud import logging
27
+ from google.cloud import logging_v2
28
+ from google.cloud.logging_v2.types import ListLogEntriesRequest
29
29
  except ImportError:
30
30
  GCP_LOGGING_AVAILABLE = False
31
31
 
@@ -50,7 +50,7 @@ class GCPLogStorage(LogStorage):
50
50
 
51
51
  def __init__(self, project_id: str):
52
52
  try:
53
- self.client = logging.Client(project=project_id)
53
+ self.client = logging_v2.Client(project=project_id)
54
54
  self.logger = self.client.logger(name=self.LOG_NAME)
55
55
  self.logger.list_entries(max_results=1)
56
56
  # Python client doesn't seem to support dry_run,
@@ -64,6 +64,7 @@ class GCPLogStorage(LogStorage):
64
64
  raise LogStorageError("Insufficient permissions")
65
65
 
66
66
  def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
67
+ # TODO: GCP may return logs in random order when events have the same timestamp.
67
68
  producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
68
69
  stream_name = self._get_stream_name(
69
70
  project_name=project.name,
@@ -78,23 +79,27 @@ class GCPLogStorage(LogStorage):
78
79
  log_filters.append(f'timestamp < "{request.end_time.isoformat()}"')
79
80
  log_filter = " AND ".join(log_filters)
80
81
 
81
- order_by = logging.DESCENDING if request.descending else logging.ASCENDING
82
+ order_by = logging_v2.DESCENDING if request.descending else logging_v2.ASCENDING
82
83
  try:
83
- entries: Iterable[logging.LogEntry] = self.logger.list_entries(
84
- filter_=log_filter,
84
+ # Use low-level API to get access to next_page_token
85
+ request_obj = ListLogEntriesRequest(
86
+ resource_names=[f"projects/{self.client.project}"],
87
+ filter=log_filter,
85
88
  order_by=order_by,
86
- max_results=request.limit,
87
- # Specify max possible page_size (<=1000) to reduce number of API calls.
88
89
  page_size=request.limit,
90
+ page_token=request.next_token,
89
91
  )
92
+ response = self.client._logging_api._gapic_api.list_log_entries(request=request_obj)
93
+
90
94
  logs = [
91
95
  LogEvent(
92
96
  timestamp=entry.timestamp,
93
- message=entry.payload["message"],
97
+ message=entry.json_payload.get("message"),
94
98
  log_source=LogEventSource.STDOUT,
95
99
  )
96
- for entry in entries
100
+ for entry in response.entries
97
101
  ]
102
+ next_token = response.next_page_token or None
98
103
  except google.api_core.exceptions.ResourceExhausted as e:
99
104
  logger.warning("GCP Logging exception: %s", repr(e))
100
105
  # GCP Logging has severely low quota of 60 reads/min for entries.list
@@ -102,11 +107,7 @@ class GCPLogStorage(LogStorage):
102
107
  "GCP Logging read request limit exceeded."
103
108
  " It's recommended to increase default entries.list request quota from 60 per minute."
104
109
  )
105
- # We intentionally make reading logs slow to prevent hitting GCP quota.
106
- # This doesn't help with many concurrent clients but
107
- # should help with one client reading all logs sequentially.
108
- time.sleep(1)
109
- return JobSubmissionLogs(logs=logs)
110
+ return JobSubmissionLogs(logs=logs, next_token=next_token if len(logs) > 0 else None)
110
111
 
111
112
  def write_logs(
112
113
  self,
@@ -74,8 +74,8 @@ async def list_user_accessible_projects(
74
74
  ) -> List[Project]:
75
75
  """
76
76
  Returns all projects accessible to the user:
77
- - For global admins: ALL projects in the system
78
- - For regular users: Projects where user is a member + public projects where user is NOT a member
77
+ - Projects where user is a member (public or private)
78
+ - Public projects where user is NOT a member
79
79
  """
80
80
  if user.global_role == GlobalRole.ADMIN:
81
81
  projects = await list_project_models(session=session)
@@ -150,6 +150,17 @@ async def create_project(
150
150
  return project_model_to_project(project_model)
151
151
 
152
152
 
153
+ async def update_project(
154
+ session: AsyncSession,
155
+ user: UserModel,
156
+ project: ProjectModel,
157
+ is_public: bool,
158
+ ):
159
+ """Update project visibility (public/private)."""
160
+ project.is_public = is_public
161
+ await session.commit()
162
+
163
+
153
164
  async def delete_projects(
154
165
  session: AsyncSession,
155
166
  user: UserModel,
@@ -163,7 +174,8 @@ async def delete_projects(
163
174
  for project_name in projects_names:
164
175
  if project_name not in user_project_names:
165
176
  raise ForbiddenError()
166
- for project in user_projects:
177
+ projects_to_delete = [p for p in user_projects if p.name in projects_names]
178
+ for project in projects_to_delete:
167
179
  if not _is_project_admin(user=user, project=project):
168
180
  raise ForbiddenError()
169
181
  if all(name in projects_names for name in user_project_names):
@@ -187,7 +199,6 @@ async def set_project_members(
187
199
  project: ProjectModel,
188
200
  members: List[MemberSetting],
189
201
  ):
190
- # reload with members
191
202
  project = await get_project_model_by_name_or_error(
192
203
  session=session,
193
204
  project_name=project.name,
@@ -212,7 +223,6 @@ async def set_project_members(
212
223
  select(UserModel).where((UserModel.name.in_(names)) | (UserModel.email.in_(names)))
213
224
  )
214
225
  users = res.scalars().all()
215
- # Create lookup maps for both username and email
216
226
  username_to_user = {user.name: user for user in users}
217
227
  email_to_user = {user.email: user for user in users if user.email}
218
228
  for i, member in enumerate(members):
@@ -230,6 +240,77 @@ async def set_project_members(
230
240
  await session.commit()
231
241
 
232
242
 
243
+ async def add_project_members(
244
+ session: AsyncSession,
245
+ user: UserModel,
246
+ project: ProjectModel,
247
+ members: List[MemberSetting],
248
+ ):
249
+ """Add multiple members to a project."""
250
+ project = await get_project_model_by_name_or_error(
251
+ session=session,
252
+ project_name=project.name,
253
+ )
254
+ requesting_user_role = get_user_project_role(user=user, project=project)
255
+
256
+ is_self_join_to_public = (
257
+ len(members) == 1
258
+ and project.is_public
259
+ and (members[0].username == user.name or members[0].username == user.email)
260
+ and requesting_user_role is None
261
+ )
262
+
263
+ if not is_self_join_to_public:
264
+ if requesting_user_role not in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
265
+ raise ForbiddenError("Access denied: insufficient permissions to add members")
266
+
267
+ if user.global_role != GlobalRole.ADMIN and requesting_user_role == ProjectRole.MANAGER:
268
+ for member in members:
269
+ if member.project_role == ProjectRole.ADMIN:
270
+ raise ForbiddenError(
271
+ "Access denied: only global admins can add project admins"
272
+ )
273
+ else:
274
+ if members[0].project_role != ProjectRole.USER:
275
+ raise ForbiddenError("Access denied: can only join public projects as user role")
276
+
277
+ usernames = [member.username for member in members]
278
+
279
+ res = await session.execute(
280
+ select(UserModel).where((UserModel.name.in_(usernames)) | (UserModel.email.in_(usernames)))
281
+ )
282
+ users_found = res.scalars().all()
283
+
284
+ username_to_user = {user.name: user for user in users_found}
285
+ email_to_user = {user.email: user for user in users_found if user.email}
286
+
287
+ member_by_user_id = {m.user_id: m for m in project.members}
288
+
289
+ for member_setting in members:
290
+ user_to_add = username_to_user.get(member_setting.username) or email_to_user.get(
291
+ member_setting.username
292
+ )
293
+ if user_to_add is None:
294
+ raise ServerClientError(f"User not found: {member_setting.username}")
295
+
296
+ if user_to_add.id in member_by_user_id:
297
+ existing_member = member_by_user_id[user_to_add.id]
298
+ if existing_member.project_role != member_setting.project_role:
299
+ existing_member.project_role = member_setting.project_role
300
+ else:
301
+ await add_project_member(
302
+ session=session,
303
+ project=project,
304
+ user=user_to_add,
305
+ project_role=member_setting.project_role,
306
+ member_num=None,
307
+ commit=False,
308
+ )
309
+ member_by_user_id[user_to_add.id] = None
310
+
311
+ await session.commit()
312
+
313
+
233
314
  async def add_project_member(
234
315
  session: AsyncSession,
235
316
  project: ProjectModel,
@@ -497,8 +578,86 @@ def _is_project_admin(
497
578
  user: UserModel,
498
579
  project: ProjectModel,
499
580
  ) -> bool:
581
+ if user.id == project.owner_id:
582
+ return True
583
+
500
584
  for m in project.members:
501
585
  if user.id == m.user_id:
502
586
  if m.project_role == ProjectRole.ADMIN:
503
587
  return True
504
588
  return False
589
+
590
+
591
+ async def remove_project_members(
592
+ session: AsyncSession,
593
+ user: UserModel,
594
+ project: ProjectModel,
595
+ usernames: List[str],
596
+ ):
597
+ """Remove multiple members from a project."""
598
+ project = await get_project_model_by_name_or_error(
599
+ session=session,
600
+ project_name=project.name,
601
+ )
602
+ requesting_user_role = get_user_project_role(user=user, project=project)
603
+
604
+ is_self_leave = (
605
+ len(usernames) == 1
606
+ and (usernames[0] == user.name or usernames[0] == user.email)
607
+ and requesting_user_role is not None
608
+ )
609
+
610
+ if not is_self_leave:
611
+ if requesting_user_role not in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
612
+ raise ForbiddenError("Access denied: insufficient permissions to remove members")
613
+
614
+ res = await session.execute(
615
+ select(UserModel).where((UserModel.name.in_(usernames)) | (UserModel.email.in_(usernames)))
616
+ )
617
+ users_found = res.scalars().all()
618
+
619
+ username_to_user = {user.name: user for user in users_found}
620
+ email_to_user = {user.email: user for user in users_found if user.email}
621
+
622
+ member_by_user_id = {m.user_id: m for m in project.members}
623
+
624
+ members_to_remove = []
625
+ admin_removals = 0
626
+
627
+ for username in usernames:
628
+ user_to_remove = username_to_user.get(username) or email_to_user.get(username)
629
+ if user_to_remove is None:
630
+ raise ServerClientError(f"User not found: {username}")
631
+
632
+ if user_to_remove.id not in member_by_user_id:
633
+ raise ServerClientError(f"User is not a member of this project: {username}")
634
+
635
+ member_to_remove = member_by_user_id[user_to_remove.id]
636
+
637
+ if member_to_remove.project_role == ProjectRole.ADMIN:
638
+ if is_self_leave:
639
+ total_admins = sum(
640
+ 1 for member in project.members if member.project_role == ProjectRole.ADMIN
641
+ )
642
+ if total_admins <= 1:
643
+ raise ServerClientError("Cannot leave project: you are the last admin")
644
+ else:
645
+ if user.global_role != GlobalRole.ADMIN:
646
+ raise ForbiddenError(
647
+ f"Access denied: only global admins can remove project admins (user: {username})"
648
+ )
649
+ admin_removals += 1
650
+
651
+ members_to_remove.append(member_to_remove)
652
+
653
+ if not is_self_leave:
654
+ total_admins = sum(
655
+ 1 for member in project.members if member.project_role == ProjectRole.ADMIN
656
+ )
657
+ if admin_removals >= total_admins:
658
+ raise ServerClientError("Cannot remove all project admins")
659
+
660
+ for member in members_to_remove:
661
+ await session.delete(member)
662
+
663
+ await session.commit()
@@ -0,0 +1,52 @@
1
+ from prometheus_client import Counter, Histogram
2
+
3
+
4
+ class RunMetrics:
5
+ """Wrapper class for run-related Prometheus metrics."""
6
+
7
+ def __init__(self):
8
+ self._submit_to_provision_duration = Histogram(
9
+ "dstack_submit_to_provision_duration_seconds",
10
+ "Time from when a run has been submitted and first job provisioning",
11
+ # Buckets optimized for percentile calculation
12
+ buckets=[
13
+ 15,
14
+ 30,
15
+ 45,
16
+ 60,
17
+ 90,
18
+ 120,
19
+ 180,
20
+ 240,
21
+ 300,
22
+ 360,
23
+ 420,
24
+ 480,
25
+ 540,
26
+ 600,
27
+ 900,
28
+ 1200,
29
+ 1800,
30
+ float("inf"),
31
+ ],
32
+ labelnames=["project_name", "run_type"],
33
+ )
34
+
35
+ self._pending_runs_total = Counter(
36
+ "dstack_pending_runs_total",
37
+ "Number of pending runs",
38
+ labelnames=["project_name", "run_type"],
39
+ )
40
+
41
+ def log_submit_to_provision_duration(
42
+ self, duration_seconds: float, project_name: str, run_type: str
43
+ ):
44
+ self._submit_to_provision_duration.labels(
45
+ project_name=project_name, run_type=run_type
46
+ ).observe(duration_seconds)
47
+
48
+ def increment_pending_runs(self, project_name: str, run_type: str):
49
+ self._pending_runs_total.labels(project_name=project_name, run_type=run_type).inc()
50
+
51
+
52
+ run_metrics = RunMetrics()
@@ -7,6 +7,7 @@ from sqlalchemy.orm import joinedload
7
7
 
8
8
  import dstack._internal.server.services.jobs as jobs_services
9
9
  from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
10
+ from dstack._internal.core.models.backends.base import BackendType
10
11
  from dstack._internal.core.models.configurations import ServiceConfiguration
11
12
  from dstack._internal.core.models.instances import RemoteConnectionInfo, SSHConnectionParams
12
13
  from dstack._internal.core.models.runs import (
@@ -86,6 +87,8 @@ class ServerProxyRepo(BaseProxyRepo):
86
87
  username=jpd.username,
87
88
  port=jpd.ssh_port,
88
89
  )
90
+ if jpd.backend == BackendType.LOCAL:
91
+ ssh_proxy = None
89
92
  ssh_head_proxy: Optional[SSHConnectionParams] = None
90
93
  ssh_head_proxy_private_key: Optional[str] = None
91
94
  instance = get_or_error(job.instance)
@@ -109,6 +109,14 @@ class RunnerClient:
109
109
  )
110
110
  resp.raise_for_status()
111
111
 
112
+ def upload_archive(self, id: uuid.UUID, file: Union[BinaryIO, bytes]):
113
+ resp = requests.post(
114
+ self._url("/api/upload_archive"),
115
+ files={"archive": (str(id), file)},
116
+ timeout=UPLOAD_CODE_REQUEST_TIMEOUT,
117
+ )
118
+ resp.raise_for_status()
119
+
112
120
  def upload_code(self, file: Union[BinaryIO, bytes]):
113
121
  resp = requests.post(
114
122
  self._url("/api/upload_code"), data=file, timeout=UPLOAD_CODE_REQUEST_TIMEOUT