dstack 0.19.15rc1__py3-none-any.whl → 0.19.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (57)
  1. dstack/_internal/core/backends/cloudrift/__init__.py +0 -0
  2. dstack/_internal/core/backends/cloudrift/api_client.py +208 -0
  3. dstack/_internal/core/backends/cloudrift/backend.py +16 -0
  4. dstack/_internal/core/backends/cloudrift/compute.py +138 -0
  5. dstack/_internal/core/backends/cloudrift/configurator.py +66 -0
  6. dstack/_internal/core/backends/cloudrift/models.py +40 -0
  7. dstack/_internal/core/backends/configurators.py +9 -0
  8. dstack/_internal/core/backends/models.py +7 -0
  9. dstack/_internal/core/compatibility/logs.py +15 -0
  10. dstack/_internal/core/compatibility/runs.py +2 -0
  11. dstack/_internal/core/models/backends/base.py +2 -0
  12. dstack/_internal/core/models/configurations.py +22 -2
  13. dstack/_internal/core/models/logs.py +2 -1
  14. dstack/_internal/core/models/runs.py +10 -1
  15. dstack/_internal/server/background/tasks/process_fleets.py +1 -1
  16. dstack/_internal/server/background/tasks/process_gateways.py +1 -1
  17. dstack/_internal/server/background/tasks/process_instances.py +1 -1
  18. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
  19. dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
  20. dstack/_internal/server/background/tasks/process_runs.py +21 -2
  21. dstack/_internal/server/background/tasks/process_submitted_jobs.py +10 -4
  22. dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
  23. dstack/_internal/server/background/tasks/process_volumes.py +1 -1
  24. dstack/_internal/server/routers/gateways.py +6 -3
  25. dstack/_internal/server/routers/projects.py +63 -0
  26. dstack/_internal/server/routers/prometheus.py +5 -5
  27. dstack/_internal/server/schemas/logs.py +10 -1
  28. dstack/_internal/server/schemas/projects.py +12 -0
  29. dstack/_internal/server/security/permissions.py +75 -2
  30. dstack/_internal/server/services/fleets.py +1 -1
  31. dstack/_internal/server/services/gateways/__init__.py +1 -1
  32. dstack/_internal/server/services/jobs/configurators/base.py +7 -1
  33. dstack/_internal/server/services/logs/aws.py +38 -38
  34. dstack/_internal/server/services/logs/filelog.py +48 -14
  35. dstack/_internal/server/services/logs/gcp.py +17 -16
  36. dstack/_internal/server/services/projects.py +164 -5
  37. dstack/_internal/server/services/prometheus/__init__.py +0 -0
  38. dstack/_internal/server/services/prometheus/client_metrics.py +52 -0
  39. dstack/_internal/server/services/runs.py +3 -3
  40. dstack/_internal/server/services/services/__init__.py +2 -1
  41. dstack/_internal/server/services/users.py +1 -3
  42. dstack/_internal/server/services/volumes.py +1 -1
  43. dstack/_internal/server/statics/index.html +1 -1
  44. dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js → main-a4eafa74304e587d037c.js} +51 -43
  45. dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js.map → main-a4eafa74304e587d037c.js.map} +1 -1
  46. dstack/_internal/server/statics/{main-f39c418b05fe14772dd8.css → main-f53d6d0d42f8d61df1de.css} +1 -1
  47. dstack/_internal/settings.py +1 -0
  48. dstack/api/_public/runs.py +6 -5
  49. dstack/api/server/_logs.py +5 -1
  50. dstack/api/server/_projects.py +24 -0
  51. dstack/version.py +1 -1
  52. {dstack-0.19.15rc1.dist-info → dstack-0.19.16.dist-info}/METADATA +1 -1
  53. {dstack-0.19.15rc1.dist-info → dstack-0.19.16.dist-info}/RECORD +57 -48
  54. /dstack/_internal/server/services/{prometheus.py → prometheus/custom_metrics.py} +0 -0
  55. {dstack-0.19.15rc1.dist-info → dstack-0.19.16.dist-info}/WHEEL +0 -0
  56. {dstack-0.19.15rc1.dist-info → dstack-0.19.16.dist-info}/entry_points.txt +0 -0
  57. {dstack-0.19.15rc1.dist-info → dstack-0.19.16.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/server/background/tasks/process_gateways.py
@@ -40,7 +40,7 @@ async def process_submitted_gateways():
             .options(lazyload(GatewayModel.gateway_compute))
             .order_by(GatewayModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         gateway_model = res.scalar()
         if gateway_model is None:
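
Note: this and many of the hunks below make the same change, adding key_share=True to SQLAlchemy's with_for_update(). On PostgreSQL this weakens the row lock from FOR UPDATE to FOR NO KEY UPDATE, which still blocks concurrent updates to the row but no longer blocks inserts of child rows referencing the locked row's key via a foreign key. A minimal sketch of the emitted SQL (not dstack code; the model import path is an assumption):

    # Minimal sketch: how key_share=True changes the SQL SQLAlchemy emits.
    from sqlalchemy import select
    from sqlalchemy.dialects import postgresql

    from dstack._internal.server.models import GatewayModel  # assumed import path

    stmt = (
        select(GatewayModel)
        .limit(1)
        .with_for_update(skip_locked=True, key_share=True)
    )
    print(stmt.compile(dialect=postgresql.dialect()))
    # ... FOR NO KEY UPDATE SKIP LOCKED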

dstack/_internal/server/background/tasks/process_instances.py
@@ -149,7 +149,7 @@ async def _process_next_instance():
             .options(lazyload(InstanceModel.jobs))
             .order_by(InstanceModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         instance = res.scalar()
         if instance is None:

dstack/_internal/server/background/tasks/process_placement_groups.py
@@ -30,7 +30,7 @@ async def process_placement_groups():
                 PlacementGroupModel.id.not_in(lockset),
             )
             .order_by(PlacementGroupModel.id)  # take locks in order
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         placement_group_models = res.scalars().all()
         if len(placement_group_models) == 0:

dstack/_internal/server/background/tasks/process_running_jobs.py
@@ -101,7 +101,7 @@ async def _process_next_running_job():
             )
             .order_by(JobModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         job_model = res.unique().scalar()
         if job_model is None:

dstack/_internal/server/background/tasks/process_runs.py
@@ -27,6 +27,7 @@ from dstack._internal.server.services.jobs import (
     group_jobs_by_replica_latest,
 )
 from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.prometheus.client_metrics import run_metrics
 from dstack._internal.server.services.runs import (
     fmt,
     process_terminating_run,

@@ -62,7 +63,7 @@ async def _process_next_run():
             )
             .order_by(RunModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         run_model = res.scalar()
         if run_model is None:

@@ -74,7 +75,7 @@ async def _process_next_run():
                 JobModel.id.not_in(job_lockset),
             )
             .order_by(JobModel.id)  # take locks in order
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         job_models = res.scalars().all()
         if len(run_model.jobs) != len(job_models):

@@ -329,6 +330,24 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
         run_model.status.name,
         new_status.name,
     )
+    if run_model.status == RunStatus.SUBMITTED and new_status == RunStatus.PROVISIONING:
+        current_time = common.get_current_datetime()
+        submit_to_provision_duration = (
+            current_time - run_model.submitted_at.replace(tzinfo=datetime.timezone.utc)
+        ).total_seconds()
+        logger.info(
+            "%s: run took %.2f seconds from submission to provisioning.",
+            fmt(run_model),
+            submit_to_provision_duration,
+        )
+        project_name = run_model.project.name
+        run_metrics.log_submit_to_provision_duration(
+            submit_to_provision_duration, project_name, run_spec.configuration.type
+        )
+
+    if new_status == RunStatus.PENDING:
+        run_metrics.increment_pending_runs(run_model.project.name, run_spec.configuration.type)
+
     run_model.status = new_status
     run_model.termination_reason = termination_reason
     # While a run goes to pending without provisioning, resubmission_attempt increases.
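
The run_metrics helper above comes from the new dstack/_internal/server/services/prometheus/client_metrics.py module (+52 lines; its body is not shown in this diff). A hypothetical sketch of what the two call sites imply, built on the standard prometheus_client primitives; metric names and label sets are assumptions:

    # Hypothetical sketch of client_metrics.py; only the method names and
    # signatures are taken from the call sites above.
    import prometheus_client


    class _RunMetrics:
        def __init__(self) -> None:
            self._submit_to_provision_duration = prometheus_client.Histogram(
                "dstack_submit_to_provision_duration_seconds",  # assumed name
                "Time from run submission to provisioning",
                labelnames=["project_name", "run_type"],
            )
            self._pending_runs = prometheus_client.Counter(
                "dstack_pending_runs_total",  # assumed name
                "Runs that transitioned to pending",
                labelnames=["project_name", "run_type"],
            )

        def log_submit_to_provision_duration(
            self, duration: float, project_name: str, run_type: str
        ) -> None:
            self._submit_to_provision_duration.labels(project_name, run_type).observe(duration)

        def increment_pending_runs(self, project_name: str, run_type: str) -> None:
            self._pending_runs.labels(project_name, run_type).inc()


    run_metrics = _RunMetrics()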

dstack/_internal/server/background/tasks/process_submitted_jobs.py
@@ -99,7 +99,7 @@ async def _process_next_submitted_job():
                 JobModel.id.not_in(lockset),
             )
             # Jobs are process in FIFO sorted by priority globally,
-            # thus runs from different project can "overtake" each other by using higher priorities.
+            # thus runs from different projects can "overtake" each other by using higher priorities.
             # That's not a big problem as long as projects do not compete for the same compute resources.
             # Jobs with lower priorities from other projects will be processed without major lag
             # as long as new higher priority runs are not constantly submitted.

@@ -108,7 +108,13 @@ async def _process_next_submitted_job():
             # there can be many projects and we are limited by the max DB connections.
             .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(
+                skip_locked=True,
+                key_share=True,
+                # Do not lock joined run, only job.
+                # Locking run here may cause deadlock.
+                of=JobModel,
+            )
         )
         job_model = res.scalar()
         if job_model is None:
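
The of=JobModel argument restricts the lock clause to the jobs table, so the runs row joined in for priority ordering is not locked; as the inline comment notes, locking the run here could deadlock against tasks that lock runs first. Continuing the sketch above (model import path assumed):

    # Sketch: of=JobModel narrows the lock clause to the jobs table.
    from dstack._internal.server.models import JobModel, RunModel  # assumed import path

    stmt = (
        select(JobModel)
        .join(RunModel)
        .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
        .limit(1)
        .with_for_update(skip_locked=True, key_share=True, of=JobModel)
    )
    # Compiles on PostgreSQL to: ... FOR NO KEY UPDATE OF jobs SKIP LOCKED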

@@ -201,7 +207,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             )
             .options(lazyload(InstanceModel.jobs))
             .order_by(InstanceModel.id)  # take locks in order
-            .with_for_update()
+            .with_for_update(key_share=True)
         )
         pool_instances = list(res.unique().scalars().all())
         instances_ids = sorted([i.id for i in pool_instances])

@@ -326,7 +332,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             .where(VolumeModel.id.in_(volumes_ids))
             .options(selectinload(VolumeModel.user))
             .order_by(VolumeModel.id)  # take locks in order
-            .with_for_update()
+            .with_for_update(key_share=True)
         )
         async with get_locker().lock_ctx(VolumeModel.__tablename__, volumes_ids):
             if len(volume_models) > 0:

dstack/_internal/server/background/tasks/process_terminating_jobs.py
@@ -45,7 +45,7 @@ async def _process_next_terminating_job():
             )
             .order_by(JobModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         job_model = res.scalar()
         if job_model is None:

@@ -58,7 +58,7 @@ async def _process_next_terminating_job():
                 InstanceModel.id.not_in(instance_lockset),
             )
             .options(lazyload(InstanceModel.jobs))
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         instance_model = res.scalar()
         if instance_model is None:

dstack/_internal/server/background/tasks/process_volumes.py
@@ -33,7 +33,7 @@ async def process_submitted_volumes():
             )
             .order_by(VolumeModel.last_processed_at.asc())
             .limit(1)
-            .with_for_update(skip_locked=True)
+            .with_for_update(skip_locked=True, key_share=True)
         )
         volume_model = res.scalar()
         if volume_model is None:

dstack/_internal/server/routers/gateways.py
@@ -9,7 +9,10 @@ import dstack._internal.server.services.gateways as gateways
 from dstack._internal.core.errors import ResourceNotExistsError
 from dstack._internal.server.db import get_session
 from dstack._internal.server.models import ProjectModel, UserModel
-from dstack._internal.server.security.permissions import ProjectAdmin, ProjectMember
+from dstack._internal.server.security.permissions import (
+    ProjectAdmin,
+    ProjectMemberOrPublicAccess,
+)
 from dstack._internal.server.utils.routers import get_base_api_additional_responses
 
 router = APIRouter(

@@ -22,7 +25,7 @@ router = APIRouter(
 @router.post("/list")
 async def list_gateways(
     session: AsyncSession = Depends(get_session),
-    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
+    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMemberOrPublicAccess()),
 ) -> List[models.Gateway]:
     _, project = user_project
     return await gateways.list_project_gateways(session=session, project=project)

@@ -32,7 +35,7 @@ async def list_gateways(
 async def get_gateway(
     body: schemas.GetGatewayRequest,
     session: AsyncSession = Depends(get_session),
-    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
+    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMemberOrPublicAccess()),
 ) -> models.Gateway:
     _, project = user_project
     gateway = await gateways.get_gateway_by_name(session=session, project=project, name=body.name)

dstack/_internal/server/routers/projects.py
@@ -7,13 +7,19 @@ from dstack._internal.core.models.projects import Project
 from dstack._internal.server.db import get_session
 from dstack._internal.server.models import ProjectModel, UserModel
 from dstack._internal.server.schemas.projects import (
+    AddProjectMemberRequest,
     CreateProjectRequest,
     DeleteProjectsRequest,
+    RemoveProjectMemberRequest,
     SetProjectMembersRequest,
+    UpdateProjectRequest,
 )
 from dstack._internal.server.security.permissions import (
     Authenticated,
+    ProjectAdmin,
     ProjectManager,
+    ProjectManagerOrPublicProject,
+    ProjectManagerOrSelfLeave,
     ProjectMemberOrPublicAccess,
 )
 from dstack._internal.server.services import projects

@@ -92,3 +98,60 @@ async def set_project_members(
     )
     await session.refresh(project)
     return projects.project_model_to_project(project)
+
+
+@router.post(
+    "/{project_name}/add_members",
+)
+async def add_project_members(
+    body: AddProjectMemberRequest,
+    session: AsyncSession = Depends(get_session),
+    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectManagerOrPublicProject()),
+) -> Project:
+    user, project = user_project
+    await projects.add_project_members(
+        session=session,
+        user=user,
+        project=project,
+        members=body.members,
+    )
+    await session.refresh(project)
+    return projects.project_model_to_project(project)
+
+
+@router.post(
+    "/{project_name}/remove_members",
+)
+async def remove_project_members(
+    body: RemoveProjectMemberRequest,
+    session: AsyncSession = Depends(get_session),
+    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectManagerOrSelfLeave()),
+) -> Project:
+    user, project = user_project
+    await projects.remove_project_members(
+        session=session,
+        user=user,
+        project=project,
+        usernames=body.usernames,
+    )
+    await session.refresh(project)
+    return projects.project_model_to_project(project)
+
+
+@router.post(
+    "/{project_name}/update",
+)
+async def update_project(
+    body: UpdateProjectRequest,
+    session: AsyncSession = Depends(get_session),
+    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()),
+) -> Project:
+    user, project = user_project
+    await projects.update_project(
+        session=session,
+        user=user,
+        project=project,
+        is_public=body.is_public,
+    )
+    await session.refresh(project)
+    return projects.project_model_to_project(project)
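
A hedged usage sketch for the three new endpoints; the URL paths come from the route decorators above, while the server URL, token, MemberSetting field values, and the /api/projects mount point are placeholders or assumptions:

    # Calling the new project endpoints over HTTP (placeholders throughout).
    import requests

    SERVER = "http://localhost:3000"  # placeholder
    HEADERS = {"Authorization": "Bearer <token>"}  # placeholder

    # Managers add members; any authenticated user can join a public project.
    requests.post(
        f"{SERVER}/api/projects/my-project/add_members",
        json={"members": [{"username": "alice", "project_role": "user"}]},
        headers=HEADERS,
    )

    # Managers remove any member; a member may remove themselves (self-leave).
    requests.post(
        f"{SERVER}/api/projects/my-project/remove_members",
        json={"usernames": ["alice"]},
        headers=HEADERS,
    )

    # Only project admins may toggle public visibility.
    requests.post(
        f"{SERVER}/api/projects/my-project/update",
        json={"is_public": False},
        headers=HEADERS,
    )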

dstack/_internal/server/routers/prometheus.py
@@ -1,15 +1,15 @@
 import os
 from typing import Annotated
 
+import prometheus_client
 from fastapi import APIRouter, Depends
 from fastapi.responses import PlainTextResponse
-from prometheus_client import generate_latest
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from dstack._internal.server import settings
 from dstack._internal.server.db import get_session
 from dstack._internal.server.security.permissions import OptionalServiceAccount
-from dstack._internal.server.services import prometheus
+from dstack._internal.server.services.prometheus import custom_metrics
 from dstack._internal.server.utils.routers import error_not_found
 
 _auth = OptionalServiceAccount(os.getenv("DSTACK_PROMETHEUS_AUTH_TOKEN"))

@@ -27,6 +27,6 @@ async def get_prometheus_metrics(
 ) -> str:
     if not settings.ENABLE_PROMETHEUS_METRICS:
         raise error_not_found()
-    custom_metrics = await prometheus.get_metrics(session=session)
-    prometheus_metrics = generate_latest()
-    return custom_metrics + prometheus_metrics.decode()
+    custom_metrics_ = await custom_metrics.get_metrics(session=session)
+    client_metrics = prometheus_client.generate_latest().decode()
+    return custom_metrics_ + client_metrics

dstack/_internal/server/schemas/logs.py
@@ -1,7 +1,7 @@
 from datetime import datetime
 from typing import Optional
 
-from pydantic import UUID4, Field
+from pydantic import UUID4, Field, validator
 
 from dstack._internal.core.models.common import CoreModel
 

@@ -12,5 +12,14 @@ class PollLogsRequest(CoreModel):
     start_time: Optional[datetime]
     end_time: Optional[datetime]
     descending: bool = False
+    next_token: Optional[str] = None
     limit: int = Field(100, ge=0, le=1000)
     diagnose: bool = False
+
+    @validator("descending")
+    @classmethod
+    def validate_descending(cls, v):
+        # Descending is not supported until we migrate from base64-encoded logs to plain text logs.
+        if v is True:
+            raise ValueError("descending: true is not supported")
+        return v
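
Behavior sketch for the new validator; required fields other than those visible in this hunk (run_name, job_submission_id) are assumptions:

    # descending=True is now rejected at request validation time.
    import pydantic

    from dstack._internal.server.schemas.logs import PollLogsRequest

    try:
        PollLogsRequest(
            run_name="my-run",  # assumed field
            job_submission_id="00000000-0000-0000-0000-000000000000",  # assumed field
            start_time=None,
            end_time=None,
            descending=True,
        )
    except pydantic.ValidationError as e:
        print(e)  # descending: true is not supported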

dstack/_internal/server/schemas/projects.py
@@ -11,6 +11,10 @@ class CreateProjectRequest(CoreModel):
     is_public: bool = False
 
 
+class UpdateProjectRequest(CoreModel):
+    is_public: bool
+
+
 class DeleteProjectsRequest(CoreModel):
     projects_names: List[str]
 

@@ -25,3 +29,11 @@ class MemberSetting(CoreModel):
 
 class SetProjectMembersRequest(CoreModel):
     members: List[MemberSetting]
+
+
+class AddProjectMemberRequest(CoreModel):
+    members: List[MemberSetting]
+
+
+class RemoveProjectMemberRequest(CoreModel):
+    usernames: List[str]

dstack/_internal/server/security/permissions.py
@@ -58,7 +58,7 @@ class ProjectAdmin:
             raise error_invalid_token()
         project = await get_project_model_by_name(session=session, project_name=project_name)
         if project is None:
-            raise error_forbidden()
+            raise error_not_found()
         if user.global_role == GlobalRole.ADMIN:
             return user, project
         project_role = get_user_project_role(user=user, project=project)

@@ -68,6 +68,10 @@ class ProjectAdmin:
 
 
 class ProjectManager:
+    """
+    Allows project admins and managers to manage projects.
+    """
+
     async def __call__(
         self,
         project_name: str,

@@ -79,12 +83,15 @@ class ProjectManager:
             raise error_invalid_token()
         project = await get_project_model_by_name(session=session, project_name=project_name)
         if project is None:
-            raise error_forbidden()
+            raise error_not_found()
+
         if user.global_role == GlobalRole.ADMIN:
             return user, project
+
         project_role = get_user_project_role(user=user, project=project)
         if project_role in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
             return user, project
+
         raise error_forbidden()
 
 

@@ -135,6 +142,72 @@ class ProjectMemberOrPublicAccess:
         raise error_forbidden()
 
 
+class ProjectManagerOrPublicProject:
+    """
+    Allows:
+    1. Project managers to perform member management operations
+    2. Access to public projects for any authenticated user
+    """
+
+    def __init__(self):
+        self.project_manager = ProjectManager()
+
+    async def __call__(
+        self,
+        project_name: str,
+        session: AsyncSession = Depends(get_session),
+        token: HTTPAuthorizationCredentials = Security(HTTPBearer()),
+    ) -> Tuple[UserModel, ProjectModel]:
+        user = await log_in_with_token(session=session, token=token.credentials)
+        if user is None:
+            raise error_invalid_token()
+        project = await get_project_model_by_name(session=session, project_name=project_name)
+        if project is None:
+            raise error_not_found()
+
+        if user.global_role == GlobalRole.ADMIN:
+            return user, project
+
+        project_role = get_user_project_role(user=user, project=project)
+        if project_role in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
+            return user, project
+
+        if project.is_public:
+            return user, project
+
+        raise error_forbidden()
+
+
+class ProjectManagerOrSelfLeave:
+    """
+    Allows:
+    1. Project managers to remove any members
+    2. Any project member to leave (remove themselves)
+    """
+
+    async def __call__(
+        self,
+        project_name: str,
+        session: AsyncSession = Depends(get_session),
+        token: HTTPAuthorizationCredentials = Security(HTTPBearer()),
+    ) -> Tuple[UserModel, ProjectModel]:
+        user = await log_in_with_token(session=session, token=token.credentials)
+        if user is None:
+            raise error_invalid_token()
+        project = await get_project_model_by_name(session=session, project_name=project_name)
+        if project is None:
+            raise error_not_found()
+
+        if user.global_role == GlobalRole.ADMIN:
+            return user, project
+
+        project_role = get_user_project_role(user=user, project=project)
+        if project_role is not None:
+            return user, project
+
+        raise error_forbidden()
+
+
 class OptionalServiceAccount:
     def __init__(self, token: Optional[str]) -> None:
         self._token = token

dstack/_internal/server/services/fleets.py
@@ -532,7 +532,7 @@ async def delete_fleets(
             .options(selectinload(FleetModel.runs))
             .execution_options(populate_existing=True)
             .order_by(FleetModel.id)  # take locks in order
-            .with_for_update()
+            .with_for_update(key_share=True)
         )
         fleet_models = res.scalars().unique().all()
         fleets = [fleet_model_to_fleet(m) for m in fleet_models]

dstack/_internal/server/services/gateways/__init__.py
@@ -240,7 +240,7 @@ async def delete_gateways(
             .options(selectinload(GatewayModel.gateway_compute))
             .execution_options(populate_existing=True)
             .order_by(GatewayModel.id)  # take locks in order
-            .with_for_update()
+            .with_for_update(key_share=True)
         )
         gateway_models = res.scalars().all()
         for gateway_model in gateway_models:

dstack/_internal/server/services/jobs/configurators/base.py
@@ -171,6 +171,8 @@ class JobConfigurator(ABC):
         return result
 
     def _dstack_image_commands(self) -> List[str]:
+        if self.run_spec.configuration.docker is True:
+            return ["start-dockerd"]
         if (
             self.run_spec.configuration.image is not None
             or self.run_spec.configuration.entrypoint is not None

@@ -201,7 +203,9 @@ class JobConfigurator(ABC):
         return self.run_spec.configuration.home_dir
 
     def _image_name(self) -> str:
-        if self.run_spec.configuration.image is not None:
+        if self.run_spec.configuration.docker is True:
+            return settings.DSTACK_DIND_IMAGE
+        elif self.run_spec.configuration.image is not None:
             return self.run_spec.configuration.image
         return get_default_image(nvcc=bool(self.run_spec.configuration.nvcc))
 

@@ -215,6 +219,8 @@ class JobConfigurator(ABC):
         return UnixUser.parse(user)
 
     def _privileged(self) -> bool:
+        if self.run_spec.configuration.docker is True:
+            return True
         return self.run_spec.configuration.privileged
 
     def _single_branch(self) -> bool:
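
Taken together, these three hooks implement a docker: true run mode (the configurations.py change in this release adds the field): the job runs dstack's DinD image from settings.DSTACK_DIND_IMAGE, is forced to privileged (dockerd requires it), and runs start-dockerd before the user's commands. A stand-alone sketch of the dispatch logic (placeholder image names; not dstack code):

    # Stand-alone reproduction of the selection logic added above.
    from dataclasses import dataclass
    from typing import List, Optional

    DSTACK_DIND_IMAGE = "dstackai/dind"  # placeholder, not taken from the diff


    @dataclass
    class Configuration:
        docker: Optional[bool] = None
        image: Optional[str] = None
        privileged: bool = False


    def image_name(conf: Configuration, default: str = "dstackai/base") -> str:
        if conf.docker is True:
            return DSTACK_DIND_IMAGE
        elif conf.image is not None:
            return conf.image
        return default


    def privileged(conf: Configuration) -> bool:
        # docker: true forces a privileged container so dockerd can run
        return True if conf.docker is True else conf.privileged


    def dstack_image_commands(conf: Configuration) -> List[str]:
        return ["start-dockerd"] if conf.docker is True else []


    assert image_name(Configuration(docker=True)) == DSTACK_DIND_IMAGE
    assert privileged(Configuration(docker=True)) is True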

dstack/_internal/server/services/logs/aws.py
@@ -78,14 +78,22 @@ class CloudWatchLogStorage(LogStorage):
             project.name, request.run_name, request.job_submission_id, log_producer
         )
         cw_events: List[_CloudWatchLogEvent]
+        next_token: Optional[str] = None
         with self._wrap_boto_errors():
             try:
-                cw_events = self._get_log_events(stream, request)
+                cw_events, next_token = self._get_log_events(stream, request)
             except botocore.exceptions.ClientError as e:
                 if not self._is_resource_not_found_exception(e):
                     raise
-                logger.debug("Stream %s not found, returning dummy response", stream)
-                cw_events = []
+                # Check if the group exists to distinguish between group not found vs stream not found
+                try:
+                    self._check_group_exists(self._group)
+                    # Group exists, so the error must be due to missing stream
+                    logger.debug("Stream %s not found, returning dummy response", stream)
+                    cw_events = []
+                except LogStorageError:
+                    # Group doesn't exist, re-raise the LogStorageError
+                    raise
         logs = [
             LogEvent(
                 timestamp=unix_time_ms_to_datetime(cw_event["timestamp"]),
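
_check_group_exists is defined outside this hunk; a hypothetical sketch of such a check with boto3 (describe_log_groups is a real CloudWatch Logs call, the body is an assumption):

    # Hypothetical sketch, not from this diff: raise LogStorageError when the
    # log group itself is missing, so only a missing stream yields empty logs.
    def _check_group_exists(self, name: str) -> None:
        response = self._client.describe_log_groups(logGroupNamePrefix=name)
        groups = response.get("logGroups", [])
        if not any(group["logGroupName"] == name for group in groups):
            raise LogStorageError(f"CloudWatch Logs group {name!r} does not exist")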

@@ -94,51 +102,43 @@ class CloudWatchLogStorage(LogStorage):
             )
             for cw_event in cw_events
         ]
-        return JobSubmissionLogs(logs=logs)
+        return JobSubmissionLogs(logs=logs, next_token=next_token if len(logs) > 0 else None)
 
-    def _get_log_events(self, stream: str, request: PollLogsRequest) -> List[_CloudWatchLogEvent]:
-        limit = request.limit
+    def _get_log_events(
+        self, stream: str, request: PollLogsRequest
+    ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
+        start_from_head = not request.descending
         parameters = {
             "logGroupName": self._group,
             "logStreamName": stream,
-            "limit": limit,
+            "limit": request.limit,
+            "startFromHead": start_from_head,
         }
-        start_from_head = not request.descending
-        parameters["startFromHead"] = start_from_head
+
         if request.start_time:
-            # XXX: Since callers use start_time/end_time for pagination, one millisecond is added
-            # to avoid an infinite loop because startTime boundary is inclusive.
             parameters["startTime"] = datetime_to_unix_time_ms(request.start_time) + 1
+
         if request.end_time:
-            # No need to substract one millisecond in this case, though, seems that endTime is
-            # exclusive, that is, time interval boundaries are [startTime, entTime)
             parameters["endTime"] = datetime_to_unix_time_ms(request.end_time)
-        # "Partially full or empty pages don't necessarily mean that pagination is finished.
-        # As long as the nextBackwardToken or nextForwardToken returned is NOT equal to the
-        # nextToken that you passed into the API call, there might be more log events available."
-        events: List[_CloudWatchLogEvent] = []
-        next_token: Optional[str] = None
+        elif start_from_head:
+            # When startFromHead=true and no endTime is provided, set endTime to "now"
+            # to prevent infinite pagination as new logs arrive faster than we can read them
+            parameters["endTime"] = datetime_to_unix_time_ms(datetime.now(timezone.utc))
+
+        if request.next_token:
+            parameters["nextToken"] = request.next_token
+
+        response = self._client.get_log_events(**parameters)
+
+        events = response.get("events", [])
         next_token_key = "nextForwardToken" if start_from_head else "nextBackwardToken"
-        # Limit max tries to avoid a possible infinite loop if the API is misbehaving
-        tries_left = 10
-        while tries_left:
-            if next_token is not None:
-                parameters["nextToken"] = next_token
-            response = self._client.get_log_events(**parameters)
-            if start_from_head:
-                events.extend(response["events"])
-            else:
-                # Regardless of the startFromHead value log events are arranged in
-                # chronological order, from earliest to latest.
-                events.extend(reversed(response["events"]))
-            if len(events) >= limit:
-                return events[:limit]
-            if response[next_token_key] == next_token:
-                return events
-            next_token = response[next_token_key]
-            tries_left -= 1
-        logger.warning("too many requests to stream %s, returning partial response", stream)
-        return events
+        next_token = response.get(next_token_key)
+
+        # TODO: The code below is not going to be used until we migrate from base64-encoded logs to plain text logs.
+        if request.descending:
+            events = list(reversed(events))
+
+        return events, next_token
 
     def write_logs(
         self,
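
With the in-method retry loop removed, pagination moves to the caller: each call now issues a single get_log_events request and returns the stream's forward or backward token, and JobSubmissionLogs.next_token is only set when the page contained events. A hedged sketch of caller-side paging (poll_logs as the entry point is assumed from the surrounding class):

    # Caller-side paging with the new next_token field (method name assumed).
    def read_all_logs(storage, project, request):
        events = []
        while True:
            job_logs = storage.poll_logs(project=project, request=request)
            events.extend(job_logs.logs)
            if job_logs.next_token is None:
                break  # empty page or no further token: stop for now
            request = request.copy(update={"next_token": job_logs.next_token})
        return events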