dstack 0.19.15rc1__py3-none-any.whl → 0.19.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic.
- dstack/_internal/cli/commands/secrets.py +92 -0
- dstack/_internal/cli/main.py +2 -0
- dstack/_internal/cli/services/completion.py +5 -0
- dstack/_internal/cli/services/configurators/run.py +59 -17
- dstack/_internal/cli/utils/secrets.py +25 -0
- dstack/_internal/core/backends/__init__.py +10 -4
- dstack/_internal/core/backends/cloudrift/__init__.py +0 -0
- dstack/_internal/core/backends/cloudrift/api_client.py +208 -0
- dstack/_internal/core/backends/cloudrift/backend.py +16 -0
- dstack/_internal/core/backends/cloudrift/compute.py +138 -0
- dstack/_internal/core/backends/cloudrift/configurator.py +66 -0
- dstack/_internal/core/backends/cloudrift/models.py +40 -0
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/compatibility/logs.py +15 -0
- dstack/_internal/core/compatibility/runs.py +31 -2
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/configurations.py +33 -2
- dstack/_internal/core/models/files.py +67 -0
- dstack/_internal/core/models/logs.py +2 -1
- dstack/_internal/core/models/runs.py +24 -1
- dstack/_internal/core/models/secrets.py +9 -2
- dstack/_internal/server/app.py +2 -0
- dstack/_internal/server/background/tasks/process_fleets.py +1 -1
- dstack/_internal/server/background/tasks/process_gateways.py +1 -1
- dstack/_internal/server/background/tasks/process_instances.py +1 -1
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +110 -13
- dstack/_internal/server/background/tasks/process_runs.py +36 -5
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +10 -4
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
- dstack/_internal/server/background/tasks/process_volumes.py +1 -1
- dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
- dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
- dstack/_internal/server/models.py +33 -0
- dstack/_internal/server/routers/files.py +67 -0
- dstack/_internal/server/routers/gateways.py +6 -3
- dstack/_internal/server/routers/projects.py +63 -0
- dstack/_internal/server/routers/prometheus.py +5 -5
- dstack/_internal/server/routers/secrets.py +57 -15
- dstack/_internal/server/schemas/files.py +5 -0
- dstack/_internal/server/schemas/logs.py +10 -1
- dstack/_internal/server/schemas/projects.py +12 -0
- dstack/_internal/server/schemas/runner.py +2 -0
- dstack/_internal/server/schemas/secrets.py +7 -11
- dstack/_internal/server/security/permissions.py +75 -2
- dstack/_internal/server/services/backends/__init__.py +1 -1
- dstack/_internal/server/services/files.py +91 -0
- dstack/_internal/server/services/fleets.py +1 -1
- dstack/_internal/server/services/gateways/__init__.py +1 -1
- dstack/_internal/server/services/jobs/__init__.py +19 -8
- dstack/_internal/server/services/jobs/configurators/base.py +27 -3
- dstack/_internal/server/services/jobs/configurators/dev.py +3 -3
- dstack/_internal/server/services/logs/aws.py +38 -38
- dstack/_internal/server/services/logs/filelog.py +48 -14
- dstack/_internal/server/services/logs/gcp.py +17 -16
- dstack/_internal/server/services/projects.py +164 -5
- dstack/_internal/server/services/prometheus/__init__.py +0 -0
- dstack/_internal/server/services/prometheus/client_metrics.py +52 -0
- dstack/_internal/server/services/proxy/repo.py +3 -0
- dstack/_internal/server/services/runner/client.py +8 -0
- dstack/_internal/server/services/runs.py +55 -10
- dstack/_internal/server/services/secrets.py +204 -0
- dstack/_internal/server/services/services/__init__.py +2 -1
- dstack/_internal/server/services/storage/base.py +21 -0
- dstack/_internal/server/services/storage/gcs.py +28 -6
- dstack/_internal/server/services/storage/s3.py +27 -9
- dstack/_internal/server/services/users.py +1 -3
- dstack/_internal/server/services/volumes.py +1 -1
- dstack/_internal/server/settings.py +2 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js → main-d151637af20f70b2e796.js} +104 -48
- dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js.map → main-d151637af20f70b2e796.js.map} +1 -1
- dstack/_internal/server/statics/{main-f39c418b05fe14772dd8.css → main-d48635d8fe670d53961c.css} +1 -1
- dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
- dstack/_internal/server/testing/common.py +43 -5
- dstack/_internal/settings.py +5 -0
- dstack/_internal/utils/files.py +69 -0
- dstack/_internal/utils/nested_list.py +47 -0
- dstack/_internal/utils/path.py +12 -4
- dstack/api/_public/runs.py +73 -12
- dstack/api/server/__init__.py +6 -0
- dstack/api/server/_files.py +18 -0
- dstack/api/server/_logs.py +5 -1
- dstack/api/server/_projects.py +24 -0
- dstack/api/server/_secrets.py +15 -15
- dstack/version.py +1 -1
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/METADATA +3 -4
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/RECORD +93 -71
- /dstack/_internal/server/services/{prometheus.py → prometheus/custom_metrics.py} +0 -0
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/WHEEL +0 -0
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/server/services/logs/aws.py
@@ -78,14 +78,22 @@ class CloudWatchLogStorage(LogStorage):
             project.name, request.run_name, request.job_submission_id, log_producer
         )
         cw_events: List[_CloudWatchLogEvent]
+        next_token: Optional[str] = None
         with self._wrap_boto_errors():
             try:
-                cw_events = self._get_log_events(stream, request)
+                cw_events, next_token = self._get_log_events(stream, request)
             except botocore.exceptions.ClientError as e:
                 if not self._is_resource_not_found_exception(e):
                     raise
-
-
+                # Check if the group exists to distinguish between group not found vs stream not found
+                try:
+                    self._check_group_exists(self._group)
+                    # Group exists, so the error must be due to missing stream
+                    logger.debug("Stream %s not found, returning dummy response", stream)
+                    cw_events = []
+                except LogStorageError:
+                    # Group doesn't exist, re-raise the LogStorageError
+                    raise
         logs = [
             LogEvent(
                 timestamp=unix_time_ms_to_datetime(cw_event["timestamp"]),
@@ -94,51 +102,43 @@ class CloudWatchLogStorage(LogStorage):
             )
             for cw_event in cw_events
         ]
-        return JobSubmissionLogs(logs=logs)
+        return JobSubmissionLogs(logs=logs, next_token=next_token if len(logs) > 0 else None)

-    def _get_log_events(
-
+    def _get_log_events(
+        self, stream: str, request: PollLogsRequest
+    ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
+        start_from_head = not request.descending
         parameters = {
             "logGroupName": self._group,
             "logStreamName": stream,
-            "limit": limit,
+            "limit": request.limit,
+            "startFromHead": start_from_head,
         }
-
-        parameters["startFromHead"] = start_from_head
+
         if request.start_time:
-            # XXX: Since callers use start_time/end_time for pagination, one millisecond is added
-            # to avoid an infinite loop because startTime boundary is inclusive.
             parameters["startTime"] = datetime_to_unix_time_ms(request.start_time) + 1
+
         if request.end_time:
-            # No need to substract one millisecond in this case, though, seems that endTime is
-            # exclusive, that is, time interval boundaries are [startTime, entTime)
             parameters["endTime"] = datetime_to_unix_time_ms(request.end_time)
-
-
-
-
-
+        elif start_from_head:
+            # When startFromHead=true and no endTime is provided, set endTime to "now"
+            # to prevent infinite pagination as new logs arrive faster than we can read them
+            parameters["endTime"] = datetime_to_unix_time_ms(datetime.now(timezone.utc))
+
+        if request.next_token:
+            parameters["nextToken"] = request.next_token
+
+        response = self._client.get_log_events(**parameters)
+
+        events = response.get("events", [])
         next_token_key = "nextForwardToken" if start_from_head else "nextBackwardToken"
-
-
-
-
-
-
-
-                events.extend(response["events"])
-            else:
-                # Regardless of the startFromHead value log events are arranged in
-                # chronological order, from earliest to latest.
-                events.extend(reversed(response["events"]))
-            if len(events) >= limit:
-                return events[:limit]
-            if response[next_token_key] == next_token:
-                return events
-            next_token = response[next_token_key]
-            tries_left -= 1
-        logger.warning("too many requests to stream %s, returning partial response", stream)
-        return events
+        next_token = response.get(next_token_key)
+
+        # TODO: The code below is not going to be used until we migrate from base64-encoded logs to plain text logs.
+        if request.descending:
+            events = list(reversed(events))
+
+        return events, next_token

     def write_logs(
         self,
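
For orientation, here is a minimal consumption sketch (not part of the diff) of the token-based pagination that poll_logs now returns. It assumes an already-configured log storage and ProjectModel, and that PollLogsRequest accepts the fields this diff references (run_name, job_submission_id, limit, next_token, descending); the exact constructor signature is an assumption.

from typing import List, Optional

from dstack._internal.server.schemas.logs import PollLogsRequest

def read_all_log_events(storage, project, run_name, job_submission_id) -> List:
    # Page through logs until the server stops returning a next_token.
    events: List = []
    next_token: Optional[str] = None
    while True:
        request = PollLogsRequest(  # field set assumed from this diff
            run_name=run_name,
            job_submission_id=job_submission_id,
            limit=1000,
            next_token=next_token,
            descending=False,
        )
        resp = storage.poll_logs(project, request)
        events.extend(resp.logs)
        next_token = resp.next_token
        if next_token is None:  # poll_logs returns None on the last page
            return events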

dstack/_internal/server/services/logs/filelog.py
@@ -14,6 +14,7 @@ from dstack._internal.server.schemas.logs import PollLogsRequest
 from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
+    LogStorageError,
     b64encode_raw_message,
     unix_time_ms_to_datetime,
 )
@@ -29,7 +30,9 @@ class FileLogStorage(LogStorage):
         self.root = Path(root)

     def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
-
+        if request.descending:
+            raise LogStorageError("descending: true is not supported")
+
         log_producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
         log_file_path = self._get_log_file_path(
             project_name=project.name,
@@ -37,22 +40,53 @@ class FileLogStorage(LogStorage):
             job_submission_id=request.job_submission_id,
             producer=log_producer,
         )
+
+        start_line = 0
+        if request.next_token:
+            try:
+                start_line = int(request.next_token)
+                if start_line < 0:
+                    raise LogStorageError(
+                        f"Invalid next_token: {request.next_token}. Must be a non-negative integer."
+                    )
+            except ValueError:
+                raise LogStorageError(
+                    f"Invalid next_token: {request.next_token}. Must be a valid integer."
+                )
+
         logs = []
+        next_token = None
+        current_line = 0
+
         try:
             with open(log_file_path) as f:
-
-
-
-
-
-
-
-
-
-
-
-
-
+                lines = f.readlines()
+
+                for i, line in enumerate(lines):
+                    if current_line < start_line:
+                        current_line += 1
+                        continue
+
+                    log_event = LogEvent.__response__.parse_raw(line)
+                    current_line += 1
+
+                    if request.start_time and log_event.timestamp <= request.start_time:
+                        continue
+                    if request.end_time is not None and log_event.timestamp >= request.end_time:
+                        break
+
+                    logs.append(log_event)
+
+                    if len(logs) >= request.limit:
+                        # Only set next_token if there are more lines to read
+                        if current_line < len(lines):
+                            next_token = str(current_line)
+                        break
+
+        except IOError as e:
+            raise LogStorageError(f"Failed to read log file {log_file_path}: {e}")
+
+        return JobSubmissionLogs(logs=logs, next_token=next_token)

     def write_logs(
         self,
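
The FileLogStorage token is just a line offset: next_token encodes the index of the first unread line, so each page resumes where the previous one stopped. A self-contained toy version of the same scheme, useful for seeing how pages chain together (all names here are illustrative, not from dstack):

from typing import List, Optional, Tuple

def poll_lines(
    lines: List[str], limit: int, next_token: Optional[str]
) -> Tuple[List[str], Optional[str]]:
    # next_token is the index of the first unread line, encoded as a string.
    start = int(next_token) if next_token else 0
    if start < 0:
        raise ValueError(f"Invalid next_token: {next_token}")
    page = lines[start : start + limit]
    end = start + len(page)
    # Only hand back a token when there are more lines to read, as in the diff.
    return page, str(end) if end < len(lines) else None

log_lines = [f"event {i}" for i in range(5)]
page1, token = poll_lines(log_lines, limit=2, next_token=None)   # ["event 0", "event 1"], "2"
page2, token = poll_lines(log_lines, limit=2, next_token=token)  # ["event 2", "event 3"], "4"
page3, token = poll_lines(log_lines, limit=2, next_token=token)  # ["event 4"], None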

dstack/_internal/server/services/logs/gcp.py
@@ -1,5 +1,4 @@
-import time
-from typing import Iterable, List
+from typing import List
 from uuid import UUID

 from dstack._internal.core.errors import ServerClientError
@@ -25,7 +24,8 @@ GCP_LOGGING_AVAILABLE = True
 try:
     import google.api_core.exceptions
     import google.auth.exceptions
-    from google.cloud import
+    from google.cloud import logging_v2
+    from google.cloud.logging_v2.types import ListLogEntriesRequest
 except ImportError:
     GCP_LOGGING_AVAILABLE = False

@@ -50,7 +50,7 @@ class GCPLogStorage(LogStorage):

     def __init__(self, project_id: str):
         try:
-            self.client =
+            self.client = logging_v2.Client(project=project_id)
             self.logger = self.client.logger(name=self.LOG_NAME)
             self.logger.list_entries(max_results=1)
             # Python client doesn't seem to support dry_run,
@@ -64,6 +64,7 @@ class GCPLogStorage(LogStorage):
             raise LogStorageError("Insufficient permissions")

     def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
+        # TODO: GCP may return logs in random order when events have the same timestamp.
         producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
         stream_name = self._get_stream_name(
             project_name=project.name,
@@ -78,23 +79,27 @@ class GCPLogStorage(LogStorage):
         log_filters.append(f'timestamp < "{request.end_time.isoformat()}"')
         log_filter = " AND ".join(log_filters)

-        order_by =
+        order_by = logging_v2.DESCENDING if request.descending else logging_v2.ASCENDING
         try:
-
-
+            # Use low-level API to get access to next_page_token
+            request_obj = ListLogEntriesRequest(
+                resource_names=[f"projects/{self.client.project}"],
+                filter=log_filter,
                 order_by=order_by,
-                max_results=request.limit,
-                # Specify max possible page_size (<=1000) to reduce number of API calls.
                 page_size=request.limit,
+                page_token=request.next_token,
             )
+            response = self.client._logging_api._gapic_api.list_log_entries(request=request_obj)
+
             logs = [
                 LogEvent(
                     timestamp=entry.timestamp,
-                    message=entry.
+                    message=entry.json_payload.get("message"),
                     log_source=LogEventSource.STDOUT,
                 )
-                for entry in entries
+                for entry in response.entries
             ]
+            next_token = response.next_page_token or None
         except google.api_core.exceptions.ResourceExhausted as e:
             logger.warning("GCP Logging exception: %s", repr(e))
             # GCP Logging has severely low quota of 60 reads/min for entries.list
@@ -102,11 +107,7 @@ class GCPLogStorage(LogStorage):
                 "GCP Logging read request limit exceeded."
                 " It's recommended to increase default entries.list request quota from 60 per minute."
            )
-
-            # This doesn't help with many concurrent clients but
-            # should help with one client reading all logs sequentially.
-            time.sleep(1)
-        return JobSubmissionLogs(logs=logs)
+        return JobSubmissionLogs(logs=logs, next_token=next_token if len(logs) > 0 else None)

     def write_logs(
         self,

dstack/_internal/server/services/projects.py
@@ -74,8 +74,8 @@ async def list_user_accessible_projects(
 ) -> List[Project]:
     """
     Returns all projects accessible to the user:
-    -
-    -
+    - Projects where user is a member (public or private)
+    - Public projects where user is NOT a member
     """
     if user.global_role == GlobalRole.ADMIN:
         projects = await list_project_models(session=session)
@@ -150,6 +150,17 @@ async def create_project(
     return project_model_to_project(project_model)


+async def update_project(
+    session: AsyncSession,
+    user: UserModel,
+    project: ProjectModel,
+    is_public: bool,
+):
+    """Update project visibility (public/private)."""
+    project.is_public = is_public
+    await session.commit()
+
+
 async def delete_projects(
     session: AsyncSession,
     user: UserModel,
@@ -163,7 +174,8 @@ async def delete_projects(
     for project_name in projects_names:
         if project_name not in user_project_names:
             raise ForbiddenError()
-    for
+    projects_to_delete = [p for p in user_projects if p.name in projects_names]
+    for project in projects_to_delete:
         if not _is_project_admin(user=user, project=project):
             raise ForbiddenError()
     if all(name in projects_names for name in user_project_names):
@@ -187,7 +199,6 @@ async def set_project_members(
     project: ProjectModel,
     members: List[MemberSetting],
 ):
-    # reload with members
     project = await get_project_model_by_name_or_error(
         session=session,
         project_name=project.name,
@@ -212,7 +223,6 @@ async def set_project_members(
         select(UserModel).where((UserModel.name.in_(names)) | (UserModel.email.in_(names)))
     )
     users = res.scalars().all()
-    # Create lookup maps for both username and email
     username_to_user = {user.name: user for user in users}
     email_to_user = {user.email: user for user in users if user.email}
     for i, member in enumerate(members):
@@ -230,6 +240,77 @@ async def set_project_members(
     await session.commit()


+async def add_project_members(
+    session: AsyncSession,
+    user: UserModel,
+    project: ProjectModel,
+    members: List[MemberSetting],
+):
+    """Add multiple members to a project."""
+    project = await get_project_model_by_name_or_error(
+        session=session,
+        project_name=project.name,
+    )
+    requesting_user_role = get_user_project_role(user=user, project=project)
+
+    is_self_join_to_public = (
+        len(members) == 1
+        and project.is_public
+        and (members[0].username == user.name or members[0].username == user.email)
+        and requesting_user_role is None
+    )
+
+    if not is_self_join_to_public:
+        if requesting_user_role not in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
+            raise ForbiddenError("Access denied: insufficient permissions to add members")
+
+        if user.global_role != GlobalRole.ADMIN and requesting_user_role == ProjectRole.MANAGER:
+            for member in members:
+                if member.project_role == ProjectRole.ADMIN:
+                    raise ForbiddenError(
+                        "Access denied: only global admins can add project admins"
+                    )
+    else:
+        if members[0].project_role != ProjectRole.USER:
+            raise ForbiddenError("Access denied: can only join public projects as user role")
+
+    usernames = [member.username for member in members]
+
+    res = await session.execute(
+        select(UserModel).where((UserModel.name.in_(usernames)) | (UserModel.email.in_(usernames)))
+    )
+    users_found = res.scalars().all()
+
+    username_to_user = {user.name: user for user in users_found}
+    email_to_user = {user.email: user for user in users_found if user.email}
+
+    member_by_user_id = {m.user_id: m for m in project.members}
+
+    for member_setting in members:
+        user_to_add = username_to_user.get(member_setting.username) or email_to_user.get(
+            member_setting.username
+        )
+        if user_to_add is None:
+            raise ServerClientError(f"User not found: {member_setting.username}")
+
+        if user_to_add.id in member_by_user_id:
+            existing_member = member_by_user_id[user_to_add.id]
+            if existing_member.project_role != member_setting.project_role:
+                existing_member.project_role = member_setting.project_role
+        else:
+            await add_project_member(
+                session=session,
+                project=project,
+                user=user_to_add,
+                project_role=member_setting.project_role,
+                member_num=None,
+                commit=False,
+            )
+            member_by_user_id[user_to_add.id] = None
+
+    await session.commit()
+
+
 async def add_project_member(
     session: AsyncSession,
     project: ProjectModel,
@@ -497,8 +578,86 @@ def _is_project_admin(
     user: UserModel,
     project: ProjectModel,
 ) -> bool:
+    if user.id == project.owner_id:
+        return True
+
     for m in project.members:
         if user.id == m.user_id:
             if m.project_role == ProjectRole.ADMIN:
                 return True
     return False
+
+
+async def remove_project_members(
+    session: AsyncSession,
+    user: UserModel,
+    project: ProjectModel,
+    usernames: List[str],
+):
+    """Remove multiple members from a project."""
+    project = await get_project_model_by_name_or_error(
+        session=session,
+        project_name=project.name,
+    )
+    requesting_user_role = get_user_project_role(user=user, project=project)
+
+    is_self_leave = (
+        len(usernames) == 1
+        and (usernames[0] == user.name or usernames[0] == user.email)
+        and requesting_user_role is not None
+    )
+
+    if not is_self_leave:
+        if requesting_user_role not in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
+            raise ForbiddenError("Access denied: insufficient permissions to remove members")
+
+    res = await session.execute(
+        select(UserModel).where((UserModel.name.in_(usernames)) | (UserModel.email.in_(usernames)))
+    )
+    users_found = res.scalars().all()
+
+    username_to_user = {user.name: user for user in users_found}
+    email_to_user = {user.email: user for user in users_found if user.email}
+
+    member_by_user_id = {m.user_id: m for m in project.members}
+
+    members_to_remove = []
+    admin_removals = 0
+
+    for username in usernames:
+        user_to_remove = username_to_user.get(username) or email_to_user.get(username)
+        if user_to_remove is None:
+            raise ServerClientError(f"User not found: {username}")
+
+        if user_to_remove.id not in member_by_user_id:
+            raise ServerClientError(f"User is not a member of this project: {username}")
+
+        member_to_remove = member_by_user_id[user_to_remove.id]
+
+        if member_to_remove.project_role == ProjectRole.ADMIN:
+            if is_self_leave:
+                total_admins = sum(
+                    1 for member in project.members if member.project_role == ProjectRole.ADMIN
+                )
+                if total_admins <= 1:
+                    raise ServerClientError("Cannot leave project: you are the last admin")
+            else:
+                if user.global_role != GlobalRole.ADMIN:
+                    raise ForbiddenError(
+                        f"Access denied: only global admins can remove project admins (user: {username})"
+                    )
+            admin_removals += 1
+
+        members_to_remove.append(member_to_remove)
+
+    if not is_self_leave:
+        total_admins = sum(
+            1 for member in project.members if member.project_role == ProjectRole.ADMIN
+        )
+        if admin_removals >= total_admins:
+            raise ServerClientError("Cannot remove all project admins")
+
+    for member in members_to_remove:
+        await session.delete(member)
+
+    await session.commit()
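
To make the branching in add_project_members easier to follow, here is a pure-function distillation of its permission rules: a sketch using simplified string roles rather than the actual dstack types, with all names illustrative.

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class MemberSetting:
    username: str
    project_role: str  # "admin" | "manager" | "user"

def can_add_members(
    requester: str,
    requester_global_role: str,             # "admin" or "user"
    requester_project_role: Optional[str],  # None when not a member
    project_is_public: bool,
    members: List[MemberSetting],
) -> bool:
    # A single non-member adding themselves to a public project is a self-join...
    is_self_join = (
        len(members) == 1
        and project_is_public
        and members[0].username == requester
        and requester_project_role is None
    )
    if is_self_join:
        # ...which is only allowed with the plain user role.
        return members[0].project_role == "user"
    # Everyone else must be a project admin or manager.
    if requester_project_role not in ("admin", "manager"):
        return False
    # Managers who are not global admins cannot grant the project admin role.
    if requester_global_role != "admin" and requester_project_role == "manager":
        return all(m.project_role != "admin" for m in members)
    return True

assert can_add_members("alice", "user", None, True, [MemberSetting("alice", "user")])
assert not can_add_members("alice", "user", None, True, [MemberSetting("alice", "admin")])
assert not can_add_members("bob", "user", "manager", False, [MemberSetting("carol", "admin")])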

/dstack/_internal/server/services/{prometheus.py → prometheus/custom_metrics.py}: file renamed without changes.

dstack/_internal/server/services/prometheus/client_metrics.py (new file)
@@ -0,0 +1,52 @@
+from prometheus_client import Counter, Histogram
+
+
+class RunMetrics:
+    """Wrapper class for run-related Prometheus metrics."""
+
+    def __init__(self):
+        self._submit_to_provision_duration = Histogram(
+            "dstack_submit_to_provision_duration_seconds",
+            "Time from when a run has been submitted and first job provisioning",
+            # Buckets optimized for percentile calculation
+            buckets=[
+                15,
+                30,
+                45,
+                60,
+                90,
+                120,
+                180,
+                240,
+                300,
+                360,
+                420,
+                480,
+                540,
+                600,
+                900,
+                1200,
+                1800,
+                float("inf"),
+            ],
+            labelnames=["project_name", "run_type"],
+        )
+
+        self._pending_runs_total = Counter(
+            "dstack_pending_runs_total",
+            "Number of pending runs",
+            labelnames=["project_name", "run_type"],
+        )
+
+    def log_submit_to_provision_duration(
+        self, duration_seconds: float, project_name: str, run_type: str
+    ):
+        self._submit_to_provision_duration.labels(
+            project_name=project_name, run_type=run_type
+        ).observe(duration_seconds)
+
+    def increment_pending_runs(self, project_name: str, run_type: str):
+        self._pending_runs_total.labels(project_name=project_name, run_type=run_type).inc()
+
+
+run_metrics = RunMetrics()
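
prometheus_client's Histogram.observe() and Counter.inc() behave as used above, so the new singleton can be exercised directly. A usage sketch; the label values here are made up for illustration:

from dstack._internal.server.services.prometheus.client_metrics import run_metrics

# Observe that a run took 95 seconds from submission to first job provisioning.
run_metrics.log_submit_to_provision_duration(
    95.0, project_name="main", run_type="task"  # label values are illustrative
)

# Count a run that had to wait because no capacity was available.
run_metrics.increment_pending_runs(project_name="main", run_type="task")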

dstack/_internal/server/services/proxy/repo.py
@@ -7,6 +7,7 @@ from sqlalchemy.orm import joinedload

 import dstack._internal.server.services.jobs as jobs_services
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
+from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.configurations import ServiceConfiguration
 from dstack._internal.core.models.instances import RemoteConnectionInfo, SSHConnectionParams
 from dstack._internal.core.models.runs import (
@@ -86,6 +87,8 @@ class ServerProxyRepo(BaseProxyRepo):
                 username=jpd.username,
                 port=jpd.ssh_port,
             )
+            if jpd.backend == BackendType.LOCAL:
+                ssh_proxy = None
             ssh_head_proxy: Optional[SSHConnectionParams] = None
             ssh_head_proxy_private_key: Optional[str] = None
             instance = get_or_error(job.instance)

dstack/_internal/server/services/runner/client.py
@@ -109,6 +109,14 @@ class RunnerClient:
         )
         resp.raise_for_status()

+    def upload_archive(self, id: uuid.UUID, file: Union[BinaryIO, bytes]):
+        resp = requests.post(
+            self._url("/api/upload_archive"),
+            files={"archive": (str(id), file)},
+            timeout=UPLOAD_CODE_REQUEST_TIMEOUT,
+        )
+        resp.raise_for_status()
+
     def upload_code(self, file: Union[BinaryIO, bytes]):
         resp = requests.post(
             self._url("/api/upload_code"), data=file, timeout=UPLOAD_CODE_REQUEST_TIMEOUT