dstack-0.19.15rc1-py3-none-any.whl → dstack-0.19.16-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/core/backends/cloudrift/__init__.py +0 -0
- dstack/_internal/core/backends/cloudrift/api_client.py +208 -0
- dstack/_internal/core/backends/cloudrift/backend.py +16 -0
- dstack/_internal/core/backends/cloudrift/compute.py +138 -0
- dstack/_internal/core/backends/cloudrift/configurator.py +66 -0
- dstack/_internal/core/backends/cloudrift/models.py +40 -0
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/compatibility/logs.py +15 -0
- dstack/_internal/core/compatibility/runs.py +2 -0
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/configurations.py +22 -2
- dstack/_internal/core/models/logs.py +2 -1
- dstack/_internal/core/models/runs.py +10 -1
- dstack/_internal/server/background/tasks/process_fleets.py +1 -1
- dstack/_internal/server/background/tasks/process_gateways.py +1 -1
- dstack/_internal/server/background/tasks/process_instances.py +1 -1
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
- dstack/_internal/server/background/tasks/process_runs.py +21 -2
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +10 -4
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
- dstack/_internal/server/background/tasks/process_volumes.py +1 -1
- dstack/_internal/server/routers/gateways.py +6 -3
- dstack/_internal/server/routers/projects.py +63 -0
- dstack/_internal/server/routers/prometheus.py +5 -5
- dstack/_internal/server/schemas/logs.py +10 -1
- dstack/_internal/server/schemas/projects.py +12 -0
- dstack/_internal/server/security/permissions.py +75 -2
- dstack/_internal/server/services/fleets.py +1 -1
- dstack/_internal/server/services/gateways/__init__.py +1 -1
- dstack/_internal/server/services/jobs/configurators/base.py +7 -1
- dstack/_internal/server/services/logs/aws.py +38 -38
- dstack/_internal/server/services/logs/filelog.py +48 -14
- dstack/_internal/server/services/logs/gcp.py +17 -16
- dstack/_internal/server/services/projects.py +164 -5
- dstack/_internal/server/services/prometheus/__init__.py +0 -0
- dstack/_internal/server/services/prometheus/client_metrics.py +52 -0
- dstack/_internal/server/services/runs.py +3 -3
- dstack/_internal/server/services/services/__init__.py +2 -1
- dstack/_internal/server/services/users.py +1 -3
- dstack/_internal/server/services/volumes.py +1 -1
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js → main-a4eafa74304e587d037c.js} +51 -43
- dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js.map → main-a4eafa74304e587d037c.js.map} +1 -1
- dstack/_internal/server/statics/{main-f39c418b05fe14772dd8.css → main-f53d6d0d42f8d61df1de.css} +1 -1
- dstack/_internal/settings.py +1 -0
- dstack/api/_public/runs.py +6 -5
- dstack/api/server/_logs.py +5 -1
- dstack/api/server/_projects.py +24 -0
- dstack/version.py +1 -1
- {dstack-0.19.15rc1.dist-info → dstack-0.19.16.dist-info}/METADATA +1 -1
- {dstack-0.19.15rc1.dist-info → dstack-0.19.16.dist-info}/RECORD +57 -48
- /dstack/_internal/server/services/{prometheus.py → prometheus/custom_metrics.py} +0 -0
- {dstack-0.19.15rc1.dist-info → dstack-0.19.16.dist-info}/WHEEL +0 -0
- {dstack-0.19.15rc1.dist-info → dstack-0.19.16.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.15rc1.dist-info → dstack-0.19.16.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_gateways.py

@@ -40,7 +40,7 @@ async def process_submitted_gateways():
             .options(lazyload(GatewayModel.gateway_compute))
             .order_by(GatewayModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         gateway_model = res.scalar()
         if gateway_model is None:
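The recurring change in this and the following background-task queries is the addition of key_share=True to SQLAlchemy's with_for_update(). On PostgreSQL this renders FOR NO KEY UPDATE instead of FOR UPDATE, a weaker row lock that does not block concurrent inserts or updates of rows that merely reference the locked row through a foreign key. The snippet below is not part of the diff; it is a minimal sketch with a hypothetical stand-in model showing the SQL that SQLAlchemy renders for this option.

# Illustrative sketch, not from the package: the SQL rendered by
# with_for_update(skip_locked=True, key_share=True) on the PostgreSQL dialect.
from sqlalchemy import Column, Integer, select
from sqlalchemy.dialects import postgresql
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class JobModel(Base):  # hypothetical stand-in for the real model
    __tablename__ = "jobs"
    id = Column(Integer, primary_key=True)


stmt = (
    select(JobModel)
    .order_by(JobModel.id)
    .limit(1)
    .with_for_update(skip_locked=True, key_share=True)
)
print(stmt.compile(dialect=postgresql.dialect()))
# Roughly: SELECT jobs.id FROM jobs ORDER BY jobs.id
#          LIMIT %(param_1)s FOR NO KEY UPDATE SKIP LOCKED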
dstack/_internal/server/background/tasks/process_instances.py

@@ -149,7 +149,7 @@ async def _process_next_instance():
             .options(lazyload(InstanceModel.jobs))
             .order_by(InstanceModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         instance = res.scalar()
         if instance is None:
dstack/_internal/server/background/tasks/process_placement_groups.py

@@ -30,7 +30,7 @@ async def process_placement_groups():
                 PlacementGroupModel.id.not_in(lockset),
             )
             .order_by(PlacementGroupModel.id)  # take locks in order
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         placement_group_models = res.scalars().all()
         if len(placement_group_models) == 0:
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -101,7 +101,7 @@ async def _process_next_running_job():
             )
             .order_by(JobModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         job_model = res.unique().scalar()
         if job_model is None:
dstack/_internal/server/background/tasks/process_runs.py

@@ -27,6 +27,7 @@ from dstack._internal.server.services.jobs import (
     group_jobs_by_replica_latest,
 )
 from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.prometheus.client_metrics import run_metrics
 from dstack._internal.server.services.runs import (
     fmt,
     process_terminating_run,
@@ -62,7 +63,7 @@ async def _process_next_run():
             )
             .order_by(RunModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         run_model = res.scalar()
         if run_model is None:
@@ -74,7 +75,7 @@ async def _process_next_run():
                 JobModel.id.not_in(job_lockset),
             )
             .order_by(JobModel.id)  # take locks in order
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         job_models = res.scalars().all()
         if len(run_model.jobs) != len(job_models):
@@ -329,6 +330,24 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
         run_model.status.name,
         new_status.name,
     )
+    if run_model.status == RunStatus.SUBMITTED and new_status == RunStatus.PROVISIONING:
+        current_time = common.get_current_datetime()
+        submit_to_provision_duration = (
+            current_time - run_model.submitted_at.replace(tzinfo=datetime.timezone.utc)
+        ).total_seconds()
+        logger.info(
+            "%s: run took %.2f seconds from submision to provisioning.",
+            fmt(run_model),
+            submit_to_provision_duration,
+        )
+        project_name = run_model.project.name
+        run_metrics.log_submit_to_provision_duration(
+            submit_to_provision_duration, project_name, run_spec.configuration.type
+        )
+
+    if new_status == RunStatus.PENDING:
+        run_metrics.increment_pending_runs(run_model.project.name, run_spec.configuration.type)
+
     run_model.status = new_status
     run_model.termination_reason = termination_reason
     # While a run goes to pending without provisioning, resubmission_attempt increases.
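process_runs.py records these run metrics through the new dstack/_internal/server/services/prometheus/client_metrics.py module (+52 lines), which is not included in this excerpt. Judging only from the call sites above, it wraps prometheus_client instruments labelled by project name and configuration type. The sketch below is hypothetical: metric names, types, and labels are assumptions inferred from the calls, not the released implementation.

# Hypothetical sketch only: the real client_metrics.py is not shown in this diff.
# Names, metric types, and labels are assumptions based on the call sites.
import prometheus_client


class _RunMetrics:
    def __init__(self) -> None:
        self._submit_to_provision = prometheus_client.Histogram(
            "dstack_submit_to_provision_duration_seconds",
            "Time from run submission to provisioning",
            labelnames=["project_name", "run_type"],
        )
        self._pending_runs = prometheus_client.Counter(
            "dstack_pending_runs",
            "Runs that transitioned to the pending status",
            labelnames=["project_name", "run_type"],
        )

    def log_submit_to_provision_duration(
        self, duration_seconds: float, project_name: str, run_type: str
    ) -> None:
        self._submit_to_provision.labels(project_name, run_type).observe(duration_seconds)

    def increment_pending_runs(self, project_name: str, run_type: str) -> None:
        self._pending_runs.labels(project_name, run_type).inc()


run_metrics = _RunMetrics()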
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -99,7 +99,7 @@ async def _process_next_submitted_job():
                 JobModel.id.not_in(lockset),
             )
             # Jobs are process in FIFO sorted by priority globally,
-            # thus runs from different
+            # thus runs from different projects can "overtake" each other by using higher priorities.
             # That's not a big problem as long as projects do not compete for the same compute resources.
             # Jobs with lower priorities from other projects will be processed without major lag
             # as long as new higher priority runs are not constantly submitted.
@@ -108,7 +108,13 @@ async def _process_next_submitted_job():
             # there can be many projects and we are limited by the max DB connections.
             .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(
+            .with_for_update(
+                skip_locked=True,
+                key_share=True,
+                # Do not lock joined run, only job.
+                # Locking run here may cause deadlock.
+                of=JobModel,
+            )
         )
         job_model = res.scalar()
         if job_model is None:
@@ -201,7 +207,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             )
             .options(lazyload(InstanceModel.jobs))
             .order_by(InstanceModel.id)  # take locks in order
-            .with_for_update()
+            .with_for_update(key_share=True)
         )
         pool_instances = list(res.unique().scalars().all())
         instances_ids = sorted([i.id for i in pool_instances])
@@ -326,7 +332,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             .where(VolumeModel.id.in_(volumes_ids))
             .options(selectinload(VolumeModel.user))
             .order_by(VolumeModel.id)  # take locks in order
-            .with_for_update()
+            .with_for_update(key_share=True)
         )
         async with get_locker().lock_ctx(VolumeModel.__tablename__, volumes_ids):
             if len(volume_models) > 0:
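Besides key_share=True, the submitted-job query above also passes of=JobModel. In SQLAlchemy, with_for_update(of=...) restricts the lock to the named entity, so a SELECT that joins runs and jobs locks only the job rows (FOR NO KEY UPDATE OF jobs ... on PostgreSQL); per the in-diff comment, also locking the joined run rows could deadlock against the run-processing task. A minimal sketch, not from the package, with hypothetical stand-in models:

# Minimal sketch, not from the package: with_for_update(of=...) limits which
# joined table's rows are locked. Models below are hypothetical stand-ins.
from sqlalchemy import Column, ForeignKey, Integer, select
from sqlalchemy.dialects import postgresql
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class RunModel(Base):
    __tablename__ = "runs"
    id = Column(Integer, primary_key=True)


class JobModel(Base):
    __tablename__ = "jobs"
    id = Column(Integer, primary_key=True)
    run_id = Column(ForeignKey("runs.id"))


stmt = (
    select(JobModel)
    .join(RunModel, JobModel.run_id == RunModel.id)
    .with_for_update(skip_locked=True, key_share=True, of=JobModel)
)
print(stmt.compile(dialect=postgresql.dialect()))
# Roughly: SELECT jobs.id, jobs.run_id FROM jobs JOIN runs ON jobs.run_id = runs.id
#          FOR NO KEY UPDATE OF jobs SKIP LOCKED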
dstack/_internal/server/background/tasks/process_terminating_jobs.py

@@ -45,7 +45,7 @@ async def _process_next_terminating_job():
             )
             .order_by(JobModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         job_model = res.scalar()
         if job_model is None:
@@ -58,7 +58,7 @@ async def _process_next_terminating_job():
                 InstanceModel.id.not_in(instance_lockset),
             )
             .options(lazyload(InstanceModel.jobs))
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         instance_model = res.scalar()
         if instance_model is None:
dstack/_internal/server/background/tasks/process_volumes.py

@@ -33,7 +33,7 @@ async def process_submitted_volumes():
             )
             .order_by(VolumeModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         volume_model = res.scalar()
         if volume_model is None:
dstack/_internal/server/routers/gateways.py

@@ -9,7 +9,10 @@ import dstack._internal.server.services.gateways as gateways
 from dstack._internal.core.errors import ResourceNotExistsError
 from dstack._internal.server.db import get_session
 from dstack._internal.server.models import ProjectModel, UserModel
-from dstack._internal.server.security.permissions import
+from dstack._internal.server.security.permissions import (
+    ProjectAdmin,
+    ProjectMemberOrPublicAccess,
+)
 from dstack._internal.server.utils.routers import get_base_api_additional_responses
 
 router = APIRouter(
@@ -22,7 +25,7 @@ router = APIRouter(
 @router.post("/list")
 async def list_gateways(
     session: AsyncSession = Depends(get_session),
-    user_project: Tuple[UserModel, ProjectModel] = Depends(
+    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMemberOrPublicAccess()),
 ) -> List[models.Gateway]:
     _, project = user_project
     return await gateways.list_project_gateways(session=session, project=project)
@@ -32,7 +35,7 @@ async def list_gateways(
 async def get_gateway(
     body: schemas.GetGatewayRequest,
     session: AsyncSession = Depends(get_session),
-    user_project: Tuple[UserModel, ProjectModel] = Depends(
+    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMemberOrPublicAccess()),
 ) -> models.Gateway:
     _, project = user_project
     gateway = await gateways.get_gateway_by_name(session=session, project=project, name=body.name)
dstack/_internal/server/routers/projects.py

@@ -7,13 +7,19 @@ from dstack._internal.core.models.projects import Project
 from dstack._internal.server.db import get_session
 from dstack._internal.server.models import ProjectModel, UserModel
 from dstack._internal.server.schemas.projects import (
+    AddProjectMemberRequest,
     CreateProjectRequest,
     DeleteProjectsRequest,
+    RemoveProjectMemberRequest,
     SetProjectMembersRequest,
+    UpdateProjectRequest,
 )
 from dstack._internal.server.security.permissions import (
     Authenticated,
+    ProjectAdmin,
     ProjectManager,
+    ProjectManagerOrPublicProject,
+    ProjectManagerOrSelfLeave,
     ProjectMemberOrPublicAccess,
 )
 from dstack._internal.server.services import projects
@@ -92,3 +98,60 @@ async def set_project_members(
     )
     await session.refresh(project)
     return projects.project_model_to_project(project)
+
+
+@router.post(
+    "/{project_name}/add_members",
+)
+async def add_project_members(
+    body: AddProjectMemberRequest,
+    session: AsyncSession = Depends(get_session),
+    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectManagerOrPublicProject()),
+) -> Project:
+    user, project = user_project
+    await projects.add_project_members(
+        session=session,
+        user=user,
+        project=project,
+        members=body.members,
+    )
+    await session.refresh(project)
+    return projects.project_model_to_project(project)
+
+
+@router.post(
+    "/{project_name}/remove_members",
+)
+async def remove_project_members(
+    body: RemoveProjectMemberRequest,
+    session: AsyncSession = Depends(get_session),
+    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectManagerOrSelfLeave()),
+) -> Project:
+    user, project = user_project
+    await projects.remove_project_members(
+        session=session,
+        user=user,
+        project=project,
+        usernames=body.usernames,
+    )
+    await session.refresh(project)
+    return projects.project_model_to_project(project)
+
+
+@router.post(
+    "/{project_name}/update",
+)
+async def update_project(
+    body: UpdateProjectRequest,
+    session: AsyncSession = Depends(get_session),
+    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()),
+) -> Project:
+    user, project = user_project
+    await projects.update_project(
+        session=session,
+        user=user,
+        project=project,
+        is_public=body.is_public,
+    )
+    await session.refresh(project)
+    return projects.project_model_to_project(project)
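The three new endpoints back project-level member management and the new public/private flag: add_members is open to managers or, for public projects, to any authenticated user; remove_members additionally lets a member remove themselves; update requires a project admin and toggles is_public. A hypothetical client-side sketch follows; the /api/projects prefix, the bearer-token header, and the MemberSetting fields (username, project_role) are assumptions, not taken from this diff.

# Hypothetical client-side sketch of the new project endpoints; URL prefix,
# auth header, and member payload fields are assumptions.
import requests

server = "http://localhost:3000"
headers = {"Authorization": "Bearer <token>"}

# Make a project public (ProjectAdmin dependency)
requests.post(
    f"{server}/api/projects/my-project/update",
    json={"is_public": True},
    headers=headers,
)

# Add a member (managers, or any authenticated user if the project is public)
requests.post(
    f"{server}/api/projects/my-project/add_members",
    json={"members": [{"username": "alice", "project_role": "user"}]},
    headers=headers,
)

# Remove members; a member may remove themselves (ProjectManagerOrSelfLeave)
requests.post(
    f"{server}/api/projects/my-project/remove_members",
    json={"usernames": ["alice"]},
    headers=headers,
)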
dstack/_internal/server/routers/prometheus.py

@@ -1,15 +1,15 @@
 import os
 from typing import Annotated
 
+import prometheus_client
 from fastapi import APIRouter, Depends
 from fastapi.responses import PlainTextResponse
-from prometheus_client import generate_latest
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from dstack._internal.server import settings
 from dstack._internal.server.db import get_session
 from dstack._internal.server.security.permissions import OptionalServiceAccount
-from dstack._internal.server.services import
+from dstack._internal.server.services.prometheus import custom_metrics
 from dstack._internal.server.utils.routers import error_not_found
 
 _auth = OptionalServiceAccount(os.getenv("DSTACK_PROMETHEUS_AUTH_TOKEN"))
@@ -27,6 +27,6 @@ async def get_prometheus_metrics(
 ) -> str:
     if not settings.ENABLE_PROMETHEUS_METRICS:
         raise error_not_found()
-
-
-    return
+    custom_metrics_ = await custom_metrics.get_metrics(session=session)
+    client_metrics = prometheus_client.generate_latest().decode()
+    return custom_metrics_ + client_metrics
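The metrics endpoint now returns the server's own metrics (from the module renamed to custom_metrics) concatenated with whatever is registered in prometheus_client's default registry, such as the new run metrics. Both halves are Prometheus text exposition format, so plain string concatenation yields a single valid scrape payload. A standalone illustration, not from the package, with a hypothetical custom metric line:

# Standalone illustration, not from the package: both halves of the response are
# Prometheus text exposition format, so concatenating the strings is valid.
import prometheus_client

example_counter = prometheus_client.Counter("example_requests", "Example counter")
example_counter.inc()

custom_part = 'dstack_example_metric{project="main"} 1.0\n'  # hypothetical custom metric
client_part = prometheus_client.generate_latest().decode()
print(custom_part + client_part)  # one combined exposition document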
dstack/_internal/server/schemas/logs.py

@@ -1,7 +1,7 @@
 from datetime import datetime
 from typing import Optional
 
-from pydantic import UUID4, Field
+from pydantic import UUID4, Field, validator
 
 from dstack._internal.core.models.common import CoreModel
 
@@ -12,5 +12,14 @@ class PollLogsRequest(CoreModel):
     start_time: Optional[datetime]
     end_time: Optional[datetime]
     descending: bool = False
+    next_token: Optional[str] = None
     limit: int = Field(100, ge=0, le=1000)
     diagnose: bool = False
+
+    @validator("descending")
+    @classmethod
+    def validate_descending(cls, v):
+        # Descending is not supported until we migrate from base64-encoded logs to plain text logs.
+        if v is True:
+            raise ValueError("descending: true is not supported")
+        return v
dstack/_internal/server/schemas/projects.py

@@ -11,6 +11,10 @@ class CreateProjectRequest(CoreModel):
     is_public: bool = False
 
 
+class UpdateProjectRequest(CoreModel):
+    is_public: bool
+
+
 class DeleteProjectsRequest(CoreModel):
     projects_names: List[str]
 
@@ -25,3 +29,11 @@ class MemberSetting(CoreModel):
 
 class SetProjectMembersRequest(CoreModel):
     members: List[MemberSetting]
+
+
+class AddProjectMemberRequest(CoreModel):
+    members: List[MemberSetting]
+
+
+class RemoveProjectMemberRequest(CoreModel):
+    usernames: List[str]
dstack/_internal/server/security/permissions.py

@@ -58,7 +58,7 @@ class ProjectAdmin:
             raise error_invalid_token()
         project = await get_project_model_by_name(session=session, project_name=project_name)
         if project is None:
-            raise
+            raise error_not_found()
         if user.global_role == GlobalRole.ADMIN:
             return user, project
         project_role = get_user_project_role(user=user, project=project)
@@ -68,6 +68,10 @@ class ProjectAdmin:
 
 
 class ProjectManager:
+    """
+    Allows project admins and managers to manage projects.
+    """
+
     async def __call__(
         self,
         project_name: str,
@@ -79,12 +83,15 @@ class ProjectManager:
             raise error_invalid_token()
         project = await get_project_model_by_name(session=session, project_name=project_name)
         if project is None:
-            raise
+            raise error_not_found()
+
         if user.global_role == GlobalRole.ADMIN:
             return user, project
+
         project_role = get_user_project_role(user=user, project=project)
         if project_role in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
             return user, project
+
         raise error_forbidden()
 
 
@@ -135,6 +142,72 @@ class ProjectMemberOrPublicAccess:
         raise error_forbidden()
 
 
+class ProjectManagerOrPublicProject:
+    """
+    Allows:
+    1. Project managers to perform member management operations
+    2. Access to public projects for any authenticated user
+    """
+
+    def __init__(self):
+        self.project_manager = ProjectManager()
+
+    async def __call__(
+        self,
+        project_name: str,
+        session: AsyncSession = Depends(get_session),
+        token: HTTPAuthorizationCredentials = Security(HTTPBearer()),
+    ) -> Tuple[UserModel, ProjectModel]:
+        user = await log_in_with_token(session=session, token=token.credentials)
+        if user is None:
+            raise error_invalid_token()
+        project = await get_project_model_by_name(session=session, project_name=project_name)
+        if project is None:
+            raise error_not_found()
+
+        if user.global_role == GlobalRole.ADMIN:
+            return user, project
+
+        project_role = get_user_project_role(user=user, project=project)
+        if project_role in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
+            return user, project
+
+        if project.is_public:
+            return user, project
+
+        raise error_forbidden()
+
+
+class ProjectManagerOrSelfLeave:
+    """
+    Allows:
+    1. Project managers to remove any members
+    2. Any project member to leave (remove themselves)
+    """
+
+    async def __call__(
+        self,
+        project_name: str,
+        session: AsyncSession = Depends(get_session),
+        token: HTTPAuthorizationCredentials = Security(HTTPBearer()),
+    ) -> Tuple[UserModel, ProjectModel]:
+        user = await log_in_with_token(session=session, token=token.credentials)
+        if user is None:
+            raise error_invalid_token()
+        project = await get_project_model_by_name(session=session, project_name=project_name)
+        if project is None:
+            raise error_not_found()
+
+        if user.global_role == GlobalRole.ADMIN:
+            return user, project
+
+        project_role = get_user_project_role(user=user, project=project)
+        if project_role is not None:
+            return user, project
+
+        raise error_forbidden()
+
+
 class OptionalServiceAccount:
     def __init__(self, token: Optional[str]) -> None:
         self._token = token
dstack/_internal/server/services/fleets.py

@@ -532,7 +532,7 @@ async def delete_fleets(
             .options(selectinload(FleetModel.runs))
             .execution_options(populate_existing=True)
             .order_by(FleetModel.id)  # take locks in order
-            .with_for_update()
+            .with_for_update(key_share=True)
         )
         fleet_models = res.scalars().unique().all()
         fleets = [fleet_model_to_fleet(m) for m in fleet_models]
dstack/_internal/server/services/gateways/__init__.py

@@ -240,7 +240,7 @@ async def delete_gateways(
             .options(selectinload(GatewayModel.gateway_compute))
             .execution_options(populate_existing=True)
             .order_by(GatewayModel.id)  # take locks in order
-            .with_for_update()
+            .with_for_update(key_share=True)
         )
         gateway_models = res.scalars().all()
         for gateway_model in gateway_models:
dstack/_internal/server/services/jobs/configurators/base.py

@@ -171,6 +171,8 @@ class JobConfigurator(ABC):
         return result
 
     def _dstack_image_commands(self) -> List[str]:
+        if self.run_spec.configuration.docker is True:
+            return ["start-dockerd"]
         if (
             self.run_spec.configuration.image is not None
             or self.run_spec.configuration.entrypoint is not None
@@ -201,7 +203,9 @@ class JobConfigurator(ABC):
         return self.run_spec.configuration.home_dir
 
     def _image_name(self) -> str:
-        if self.run_spec.configuration.
+        if self.run_spec.configuration.docker is True:
+            return settings.DSTACK_DIND_IMAGE
+        elif self.run_spec.configuration.image is not None:
             return self.run_spec.configuration.image
         return get_default_image(nvcc=bool(self.run_spec.configuration.nvcc))
 
@@ -215,6 +219,8 @@ class JobConfigurator(ABC):
         return UnixUser.parse(user)
 
     def _privileged(self) -> bool:
+        if self.run_spec.configuration.docker is True:
+            return True
         return self.run_spec.configuration.privileged
 
     def _single_branch(self) -> bool:
dstack/_internal/server/services/logs/aws.py

@@ -78,14 +78,22 @@ class CloudWatchLogStorage(LogStorage):
             project.name, request.run_name, request.job_submission_id, log_producer
         )
         cw_events: List[_CloudWatchLogEvent]
+        next_token: Optional[str] = None
         with self._wrap_boto_errors():
            try:
-                cw_events = self._get_log_events(stream, request)
+                cw_events, next_token = self._get_log_events(stream, request)
             except botocore.exceptions.ClientError as e:
                 if not self._is_resource_not_found_exception(e):
                     raise
-
-
+                # Check if the group exists to distinguish between group not found vs stream not found
+                try:
+                    self._check_group_exists(self._group)
+                    # Group exists, so the error must be due to missing stream
+                    logger.debug("Stream %s not found, returning dummy response", stream)
+                    cw_events = []
+                except LogStorageError:
+                    # Group doesn't exist, re-raise the LogStorageError
+                    raise
         logs = [
             LogEvent(
                 timestamp=unix_time_ms_to_datetime(cw_event["timestamp"]),
@@ -94,51 +102,43 @@ class CloudWatchLogStorage(LogStorage):
             )
             for cw_event in cw_events
         ]
-        return JobSubmissionLogs(logs=logs)
+        return JobSubmissionLogs(logs=logs, next_token=next_token if len(logs) > 0 else None)
 
-    def _get_log_events(
-
+    def _get_log_events(
+        self, stream: str, request: PollLogsRequest
+    ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
+        start_from_head = not request.descending
         parameters = {
             "logGroupName": self._group,
             "logStreamName": stream,
-            "limit": limit,
+            "limit": request.limit,
+            "startFromHead": start_from_head,
         }
-
-        parameters["startFromHead"] = start_from_head
+
         if request.start_time:
-            # XXX: Since callers use start_time/end_time for pagination, one millisecond is added
-            # to avoid an infinite loop because startTime boundary is inclusive.
             parameters["startTime"] = datetime_to_unix_time_ms(request.start_time) + 1
+
         if request.end_time:
-            # No need to substract one millisecond in this case, though, seems that endTime is
-            # exclusive, that is, time interval boundaries are [startTime, entTime)
             parameters["endTime"] = datetime_to_unix_time_ms(request.end_time)
-
-
-
-
-
+        elif start_from_head:
+            # When startFromHead=true and no endTime is provided, set endTime to "now"
+            # to prevent infinite pagination as new logs arrive faster than we can read them
+            parameters["endTime"] = datetime_to_unix_time_ms(datetime.now(timezone.utc))
+
+        if request.next_token:
+            parameters["nextToken"] = request.next_token
+
+        response = self._client.get_log_events(**parameters)
+
+        events = response.get("events", [])
         next_token_key = "nextForwardToken" if start_from_head else "nextBackwardToken"
-
-
-
-
-
-
-
-            events.extend(response["events"])
-        else:
-            # Regardless of the startFromHead value log events are arranged in
-            # chronological order, from earliest to latest.
-            events.extend(reversed(response["events"]))
-        if len(events) >= limit:
-            return events[:limit]
-        if response[next_token_key] == next_token:
-            return events
-        next_token = response[next_token_key]
-        tries_left -= 1
-        logger.warning("too many requests to stream %s, returning partial response", stream)
-        return events
+        next_token = response.get(next_token_key)
+
+        # TODO: The code below is not going to be used until we migrate from base64-encoded logs to plain text logs.
+        if request.descending:
+            events = list(reversed(events))
+
+        return events, next_token
 
     def write_logs(
         self,