dstack 0.19.19__py3-none-any.whl → 0.19.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/core/backends/__init__.py +0 -65
- dstack/_internal/core/backends/cloudrift/api_client.py +13 -1
- dstack/_internal/core/backends/features.py +64 -0
- dstack/_internal/core/backends/oci/resources.py +5 -5
- dstack/_internal/core/compatibility/fleets.py +2 -0
- dstack/_internal/core/compatibility/runs.py +4 -0
- dstack/_internal/core/models/profiles.py +37 -0
- dstack/_internal/server/app.py +22 -10
- dstack/_internal/server/background/__init__.py +5 -6
- dstack/_internal/server/background/tasks/process_fleets.py +52 -38
- dstack/_internal/server/background/tasks/process_gateways.py +2 -2
- dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
- dstack/_internal/server/background/tasks/process_instances.py +62 -48
- dstack/_internal/server/background/tasks/process_metrics.py +9 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +129 -124
- dstack/_internal/server/background/tasks/process_runs.py +63 -20
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
- dstack/_internal/server/background/tasks/process_volumes.py +4 -1
- dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
- dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
- dstack/_internal/server/models.py +16 -16
- dstack/_internal/server/schemas/logs.py +1 -9
- dstack/_internal/server/services/fleets.py +19 -10
- dstack/_internal/server/services/gateways/__init__.py +17 -17
- dstack/_internal/server/services/instances.py +10 -14
- dstack/_internal/server/services/jobs/__init__.py +10 -12
- dstack/_internal/server/services/logs/aws.py +45 -3
- dstack/_internal/server/services/logs/filelog.py +121 -11
- dstack/_internal/server/services/offers.py +3 -3
- dstack/_internal/server/services/projects.py +35 -15
- dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
- dstack/_internal/server/services/prometheus/custom_metrics.py +22 -3
- dstack/_internal/server/services/runs.py +74 -34
- dstack/_internal/server/services/services/__init__.py +4 -1
- dstack/_internal/server/services/users.py +2 -3
- dstack/_internal/server/services/volumes.py +11 -11
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-64f8273740c4b52c18f5.js → main-39a767528976f8078166.js} +7 -26
- dstack/_internal/server/statics/{main-64f8273740c4b52c18f5.js.map → main-39a767528976f8078166.js.map} +1 -1
- dstack/_internal/server/statics/{main-d58fc0460cb0eae7cb5c.css → main-8f9ee218d3eb45989682.css} +2 -2
- dstack/_internal/server/testing/common.py +7 -0
- dstack/_internal/server/utils/sentry_utils.py +12 -0
- dstack/_internal/utils/common.py +10 -21
- dstack/_internal/utils/cron.py +5 -0
- dstack/version.py +1 -1
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/METADATA +2 -11
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/RECORD +54 -49
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/WHEEL +0 -0
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import datetime
|
|
3
3
|
import uuid
|
|
4
|
-
from datetime import timedelta
|
|
4
|
+
from datetime import timedelta
|
|
5
5
|
from functools import partial
|
|
6
6
|
from typing import List, Optional, Sequence
|
|
7
7
|
|
|
@@ -11,16 +11,16 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
11
11
|
from sqlalchemy.orm import selectinload
|
|
12
12
|
|
|
13
13
|
import dstack._internal.utils.random_names as random_names
|
|
14
|
-
from dstack._internal.core.backends import (
|
|
15
|
-
BACKENDS_WITH_GATEWAY_SUPPORT,
|
|
16
|
-
BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT,
|
|
17
|
-
)
|
|
18
14
|
from dstack._internal.core.backends.base.compute import (
|
|
19
15
|
Compute,
|
|
20
16
|
ComputeWithGatewaySupport,
|
|
21
17
|
get_dstack_gateway_wheel,
|
|
22
18
|
get_dstack_runner_version,
|
|
23
19
|
)
|
|
20
|
+
from dstack._internal.core.backends.features import (
|
|
21
|
+
BACKENDS_WITH_GATEWAY_SUPPORT,
|
|
22
|
+
BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT,
|
|
23
|
+
)
|
|
24
24
|
from dstack._internal.core.errors import (
|
|
25
25
|
GatewayError,
|
|
26
26
|
ResourceNotExistsError,
|
|
@@ -86,15 +86,6 @@ async def get_gateway_by_name(
|
|
|
86
86
|
return gateway_model_to_gateway(gateway)
|
|
87
87
|
|
|
88
88
|
|
|
89
|
-
async def get_project_default_gateway(
|
|
90
|
-
session: AsyncSession, project: ProjectModel
|
|
91
|
-
) -> Optional[Gateway]:
|
|
92
|
-
gateway: Optional[GatewayModel] = project.default_gateway
|
|
93
|
-
if gateway is None:
|
|
94
|
-
return None
|
|
95
|
-
return gateway_model_to_gateway(gateway)
|
|
96
|
-
|
|
97
|
-
|
|
98
89
|
async def create_gateway_compute(
|
|
99
90
|
project_name: str,
|
|
100
91
|
backend_compute: Compute,
|
|
@@ -181,9 +172,9 @@ async def create_gateway(
|
|
|
181
172
|
session.add(gateway)
|
|
182
173
|
await session.commit()
|
|
183
174
|
|
|
184
|
-
|
|
175
|
+
default_gateway = await get_project_default_gateway_model(session=session, project=project)
|
|
176
|
+
if default_gateway is None or configuration.default:
|
|
185
177
|
await set_default_gateway(session=session, project=project, name=configuration.name)
|
|
186
|
-
|
|
187
178
|
return gateway_model_to_gateway(gateway)
|
|
188
179
|
|
|
189
180
|
|
|
@@ -349,6 +340,15 @@ async def get_project_gateway_model_by_name(
|
|
|
349
340
|
return res.scalar()
|
|
350
341
|
|
|
351
342
|
|
|
343
|
+
async def get_project_default_gateway_model(
|
|
344
|
+
session: AsyncSession, project: ProjectModel
|
|
345
|
+
) -> Optional[GatewayModel]:
|
|
346
|
+
res = await session.execute(
|
|
347
|
+
select(GatewayModel).where(GatewayModel.id == project.default_gateway_id)
|
|
348
|
+
)
|
|
349
|
+
return res.scalar_one_or_none()
|
|
350
|
+
|
|
351
|
+
|
|
352
352
|
async def generate_gateway_name(session: AsyncSession, project: ProjectModel) -> str:
|
|
353
353
|
gateways = await list_project_gateway_models(session=session, project=project)
|
|
354
354
|
names = {g.name for g in gateways}
|
|
@@ -557,7 +557,7 @@ def gateway_model_to_gateway(gateway_model: GatewayModel) -> Gateway:
|
|
|
557
557
|
region=gateway_model.region,
|
|
558
558
|
wildcard_domain=gateway_model.wildcard_domain,
|
|
559
559
|
default=gateway_model.project.default_gateway_id == gateway_model.id,
|
|
560
|
-
created_at=gateway_model.created_at
|
|
560
|
+
created_at=gateway_model.created_at,
|
|
561
561
|
status=gateway_model.status,
|
|
562
562
|
status_message=gateway_model.status_message,
|
|
563
563
|
configuration=configuration,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import uuid
|
|
2
2
|
from collections.abc import Container, Iterable
|
|
3
|
-
from datetime import datetime
|
|
3
|
+
from datetime import datetime
|
|
4
4
|
from typing import Dict, List, Literal, Optional, Union
|
|
5
5
|
|
|
6
6
|
import gpuhunt
|
|
@@ -8,11 +8,11 @@ from sqlalchemy import and_, or_, select
|
|
|
8
8
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
9
9
|
from sqlalchemy.orm import joinedload
|
|
10
10
|
|
|
11
|
-
from dstack._internal.core.backends import BACKENDS_WITH_MULTINODE_SUPPORT
|
|
12
11
|
from dstack._internal.core.backends.base.offers import (
|
|
13
12
|
offer_to_catalog_item,
|
|
14
13
|
requirements_to_query_filter,
|
|
15
14
|
)
|
|
15
|
+
from dstack._internal.core.backends.features import BACKENDS_WITH_MULTINODE_SUPPORT
|
|
16
16
|
from dstack._internal.core.models.backends.base import BackendType
|
|
17
17
|
from dstack._internal.core.models.envs import Env
|
|
18
18
|
from dstack._internal.core.models.instances import (
|
|
@@ -34,7 +34,6 @@ from dstack._internal.core.models.profiles import (
|
|
|
34
34
|
TerminationPolicy,
|
|
35
35
|
)
|
|
36
36
|
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
37
|
-
from dstack._internal.core.models.users import GlobalRole
|
|
38
37
|
from dstack._internal.core.models.volumes import Volume
|
|
39
38
|
from dstack._internal.core.services.profiles import get_termination
|
|
40
39
|
from dstack._internal.server.models import (
|
|
@@ -44,7 +43,7 @@ from dstack._internal.server.models import (
|
|
|
44
43
|
UserModel,
|
|
45
44
|
)
|
|
46
45
|
from dstack._internal.server.services.offers import generate_shared_offer
|
|
47
|
-
from dstack._internal.server.services.projects import
|
|
46
|
+
from dstack._internal.server.services.projects import list_user_project_models
|
|
48
47
|
from dstack._internal.utils import common as common_utils
|
|
49
48
|
from dstack._internal.utils.logging import get_logger
|
|
50
49
|
|
|
@@ -62,7 +61,7 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
|
|
|
62
61
|
status=instance_model.status,
|
|
63
62
|
unreachable=instance_model.unreachable,
|
|
64
63
|
termination_reason=instance_model.termination_reason,
|
|
65
|
-
created=instance_model.created_at
|
|
64
|
+
created=instance_model.created_at,
|
|
66
65
|
total_blocks=instance_model.total_blocks,
|
|
67
66
|
busy_blocks=instance_model.busy_blocks,
|
|
68
67
|
)
|
|
@@ -372,18 +371,15 @@ async def list_user_instances(
|
|
|
372
371
|
limit: int,
|
|
373
372
|
ascending: bool,
|
|
374
373
|
) -> List[Instance]:
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
return []
|
|
381
|
-
|
|
374
|
+
projects = await list_user_project_models(
|
|
375
|
+
session=session,
|
|
376
|
+
user=user,
|
|
377
|
+
only_names=True,
|
|
378
|
+
)
|
|
382
379
|
if project_names is not None:
|
|
383
|
-
projects = [
|
|
380
|
+
projects = [p for p in projects if p.name in project_names]
|
|
384
381
|
if len(projects) == 0:
|
|
385
382
|
return []
|
|
386
|
-
|
|
387
383
|
instance_models = await list_projects_instance_models(
|
|
388
384
|
session=session,
|
|
389
385
|
projects=projects,
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import itertools
|
|
2
2
|
import json
|
|
3
|
-
from datetime import timedelta
|
|
3
|
+
from datetime import timedelta
|
|
4
4
|
from typing import Dict, Iterable, List, Optional, Tuple
|
|
5
5
|
from uuid import UUID
|
|
6
6
|
|
|
7
7
|
import requests
|
|
8
8
|
from sqlalchemy import select
|
|
9
9
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
-
from sqlalchemy.orm import joinedload
|
|
10
|
+
from sqlalchemy.orm import joinedload, load_only
|
|
11
11
|
|
|
12
12
|
import dstack._internal.server.services.backends as backends_services
|
|
13
13
|
from dstack._internal.core.backends.base.backend import Backend
|
|
@@ -130,7 +130,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
|
|
|
130
130
|
):
|
|
131
131
|
backend_data = json.loads(job_provisioning_data.backend_data)
|
|
132
132
|
job_provisioning_data.backend = backend_data["base_backend"]
|
|
133
|
-
last_processed_at = job_model.last_processed_at
|
|
133
|
+
last_processed_at = job_model.last_processed_at
|
|
134
134
|
finished_at = None
|
|
135
135
|
if job_model.status.is_finished():
|
|
136
136
|
finished_at = last_processed_at
|
|
@@ -140,7 +140,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
|
|
|
140
140
|
id=job_model.id,
|
|
141
141
|
submission_num=job_model.submission_num,
|
|
142
142
|
deployment_num=job_model.deployment_num,
|
|
143
|
-
submitted_at=job_model.submitted_at
|
|
143
|
+
submitted_at=job_model.submitted_at,
|
|
144
144
|
last_processed_at=last_processed_at,
|
|
145
145
|
finished_at=finished_at,
|
|
146
146
|
inactivity_secs=job_model.inactivity_secs,
|
|
@@ -231,10 +231,7 @@ async def process_terminating_job(
|
|
|
231
231
|
Graceful stop should already be done by `process_terminating_run`.
|
|
232
232
|
Caller must acquire the locks on the job and the job's instance.
|
|
233
233
|
"""
|
|
234
|
-
if (
|
|
235
|
-
job_model.remove_at is not None
|
|
236
|
-
and job_model.remove_at.replace(tzinfo=timezone.utc) > common.get_current_datetime()
|
|
237
|
-
):
|
|
234
|
+
if job_model.remove_at is not None and job_model.remove_at > common.get_current_datetime():
|
|
238
235
|
# it's too early to terminate the instance
|
|
239
236
|
return
|
|
240
237
|
|
|
@@ -550,24 +547,25 @@ def _should_force_detach_volume(job_model: JobModel, stop_duration: Optional[int
|
|
|
550
547
|
return (
|
|
551
548
|
job_model.volumes_detached_at is not None
|
|
552
549
|
and common.get_current_datetime()
|
|
553
|
-
> job_model.volumes_detached_at
|
|
550
|
+
> job_model.volumes_detached_at + MIN_FORCE_DETACH_WAIT_PERIOD
|
|
554
551
|
and (
|
|
555
552
|
job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER
|
|
556
553
|
or stop_duration is not None
|
|
557
554
|
and common.get_current_datetime()
|
|
558
|
-
> job_model.volumes_detached_at
|
|
559
|
-
+ timedelta(seconds=stop_duration)
|
|
555
|
+
> job_model.volumes_detached_at + timedelta(seconds=stop_duration)
|
|
560
556
|
)
|
|
561
557
|
)
|
|
562
558
|
|
|
563
559
|
|
|
564
560
|
async def get_instances_ids_with_detaching_volumes(session: AsyncSession) -> List[UUID]:
|
|
565
561
|
res = await session.execute(
|
|
566
|
-
select(JobModel)
|
|
562
|
+
select(JobModel)
|
|
563
|
+
.where(
|
|
567
564
|
JobModel.status == JobStatus.TERMINATING,
|
|
568
565
|
JobModel.used_instance_id.is_not(None),
|
|
569
566
|
JobModel.volumes_detached_at.is_not(None),
|
|
570
567
|
)
|
|
568
|
+
.options(load_only(JobModel.used_instance_id))
|
|
571
569
|
)
|
|
572
570
|
job_models = res.scalars().all()
|
|
573
571
|
return [jm.used_instance_id for jm in job_models if jm.used_instance_id]
|
|
@@ -55,6 +55,8 @@ class CloudWatchLogStorage(LogStorage):
|
|
|
55
55
|
PAST_EVENT_MAX_DELTA = int((timedelta(days=14)).total_seconds()) * 1000 - CLOCK_DRIFT
|
|
56
56
|
# "None of the log events in the batch can be more than 2 hours in the future."
|
|
57
57
|
FUTURE_EVENT_MAX_DELTA = int((timedelta(hours=2)).total_seconds()) * 1000 - CLOCK_DRIFT
|
|
58
|
+
# Maximum number of retries when polling for log events to skip empty pages.
|
|
59
|
+
MAX_RETRIES = 10
|
|
58
60
|
|
|
59
61
|
def __init__(self, *, group: str, region: Optional[str] = None) -> None:
|
|
60
62
|
with self._wrap_boto_errors():
|
|
@@ -80,7 +82,7 @@ class CloudWatchLogStorage(LogStorage):
|
|
|
80
82
|
next_token: Optional[str] = None
|
|
81
83
|
with self._wrap_boto_errors():
|
|
82
84
|
try:
|
|
83
|
-
cw_events, next_token = self.
|
|
85
|
+
cw_events, next_token = self._get_log_events_with_retry(stream, request)
|
|
84
86
|
except botocore.exceptions.ClientError as e:
|
|
85
87
|
if not self._is_resource_not_found_exception(e):
|
|
86
88
|
raise
|
|
@@ -101,7 +103,47 @@ class CloudWatchLogStorage(LogStorage):
|
|
|
101
103
|
)
|
|
102
104
|
for cw_event in cw_events
|
|
103
105
|
]
|
|
104
|
-
return JobSubmissionLogs(logs=logs, next_token=next_token
|
|
106
|
+
return JobSubmissionLogs(logs=logs, next_token=next_token)
|
|
107
|
+
|
|
108
|
+
def _get_log_events_with_retry(
|
|
109
|
+
self, stream: str, request: PollLogsRequest
|
|
110
|
+
) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
|
|
111
|
+
current_request = request
|
|
112
|
+
previous_next_token = request.next_token
|
|
113
|
+
|
|
114
|
+
for attempt in range(self.MAX_RETRIES):
|
|
115
|
+
cw_events, next_token = self._get_log_events(stream, current_request)
|
|
116
|
+
|
|
117
|
+
if cw_events:
|
|
118
|
+
return cw_events, next_token
|
|
119
|
+
|
|
120
|
+
if not next_token or next_token == previous_next_token:
|
|
121
|
+
return [], None
|
|
122
|
+
|
|
123
|
+
previous_next_token = next_token
|
|
124
|
+
current_request = PollLogsRequest(
|
|
125
|
+
run_name=request.run_name,
|
|
126
|
+
job_submission_id=request.job_submission_id,
|
|
127
|
+
start_time=request.start_time,
|
|
128
|
+
end_time=request.end_time,
|
|
129
|
+
descending=request.descending,
|
|
130
|
+
next_token=next_token,
|
|
131
|
+
limit=request.limit,
|
|
132
|
+
diagnose=request.diagnose,
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
if not request.descending:
|
|
136
|
+
logger.debug(
|
|
137
|
+
"Stream %s: exhausted %d retries without finding logs, returning empty response",
|
|
138
|
+
stream,
|
|
139
|
+
self.MAX_RETRIES,
|
|
140
|
+
)
|
|
141
|
+
# Only return the next token after exhausting retries if going descending—
|
|
142
|
+
# AWS CloudWatch guarantees more logs in that case. In ascending mode,
|
|
143
|
+
# next token is always returned, even if no logs remain.
|
|
144
|
+
# So descending works reliably; ascending has limits if gaps are too large.
|
|
145
|
+
# In the future, UI/CLI should handle retries, and we can return next token for ascending too.
|
|
146
|
+
return [], next_token if request.descending else None
|
|
105
147
|
|
|
106
148
|
def _get_log_events(
|
|
107
149
|
self, stream: str, request: PollLogsRequest
|
|
@@ -115,7 +157,7 @@ class CloudWatchLogStorage(LogStorage):
|
|
|
115
157
|
}
|
|
116
158
|
|
|
117
159
|
if request.start_time:
|
|
118
|
-
parameters["startTime"] = datetime_to_unix_time_ms(request.start_time)
|
|
160
|
+
parameters["startTime"] = datetime_to_unix_time_ms(request.start_time)
|
|
119
161
|
|
|
120
162
|
if request.end_time:
|
|
121
163
|
parameters["endTime"] = datetime_to_unix_time_ms(request.end_time)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
+
import os
|
|
1
2
|
from pathlib import Path
|
|
2
|
-
from typing import List, Union
|
|
3
|
+
from typing import Generator, List, Optional, Tuple, Union
|
|
3
4
|
from uuid import UUID
|
|
4
5
|
|
|
5
6
|
from dstack._internal.core.errors import ServerClientError
|
|
@@ -37,18 +38,17 @@ class FileLogStorage(LogStorage):
|
|
|
37
38
|
producer=log_producer,
|
|
38
39
|
)
|
|
39
40
|
|
|
41
|
+
if request.descending:
|
|
42
|
+
return self._poll_logs_descending(log_file_path, request)
|
|
43
|
+
else:
|
|
44
|
+
return self._poll_logs_ascending(log_file_path, request)
|
|
45
|
+
|
|
46
|
+
def _poll_logs_ascending(
|
|
47
|
+
self, log_file_path: Path, request: PollLogsRequest
|
|
48
|
+
) -> JobSubmissionLogs:
|
|
40
49
|
start_line = 0
|
|
41
50
|
if request.next_token:
|
|
42
|
-
|
|
43
|
-
start_line = int(request.next_token)
|
|
44
|
-
if start_line < 0:
|
|
45
|
-
raise ServerClientError(
|
|
46
|
-
f"Invalid next_token: {request.next_token}. Must be a non-negative integer."
|
|
47
|
-
)
|
|
48
|
-
except ValueError:
|
|
49
|
-
raise ServerClientError(
|
|
50
|
-
f"Invalid next_token: {request.next_token}. Must be a valid integer."
|
|
51
|
-
)
|
|
51
|
+
start_line = self._next_token(request)
|
|
52
52
|
|
|
53
53
|
logs = []
|
|
54
54
|
next_token = None
|
|
@@ -94,6 +94,102 @@ class FileLogStorage(LogStorage):
|
|
|
94
94
|
|
|
95
95
|
return JobSubmissionLogs(logs=logs, next_token=next_token)
|
|
96
96
|
|
|
97
|
+
def _poll_logs_descending(
|
|
98
|
+
self, log_file_path: Path, request: PollLogsRequest
|
|
99
|
+
) -> JobSubmissionLogs:
|
|
100
|
+
start_offset = self._next_token(request)
|
|
101
|
+
|
|
102
|
+
candidate_logs = []
|
|
103
|
+
|
|
104
|
+
try:
|
|
105
|
+
line_generator = self._read_lines_reversed(log_file_path, start_offset)
|
|
106
|
+
|
|
107
|
+
for line_bytes, line_start_offset in line_generator:
|
|
108
|
+
try:
|
|
109
|
+
line_str = line_bytes.decode("utf-8")
|
|
110
|
+
log_event = LogEvent.__response__.parse_raw(line_str)
|
|
111
|
+
except Exception:
|
|
112
|
+
continue # Skip malformed lines
|
|
113
|
+
|
|
114
|
+
if request.end_time is not None and log_event.timestamp > request.end_time:
|
|
115
|
+
continue
|
|
116
|
+
if request.start_time and log_event.timestamp <= request.start_time:
|
|
117
|
+
break
|
|
118
|
+
|
|
119
|
+
candidate_logs.append((log_event, line_start_offset))
|
|
120
|
+
|
|
121
|
+
if len(candidate_logs) > request.limit:
|
|
122
|
+
break
|
|
123
|
+
except FileNotFoundError:
|
|
124
|
+
return JobSubmissionLogs(logs=[], next_token=None)
|
|
125
|
+
|
|
126
|
+
logs = [log for log, offset in candidate_logs[: request.limit]]
|
|
127
|
+
next_token = None
|
|
128
|
+
if len(candidate_logs) > request.limit:
|
|
129
|
+
# We fetched one more than the limit, so there are more pages.
|
|
130
|
+
# The next token should point to the start of the last log we are returning.
|
|
131
|
+
_last_log_event, last_log_offset = candidate_logs[request.limit - 1]
|
|
132
|
+
next_token = str(last_log_offset)
|
|
133
|
+
|
|
134
|
+
return JobSubmissionLogs(logs=logs, next_token=next_token)
|
|
135
|
+
|
|
136
|
+
@staticmethod
|
|
137
|
+
def _read_lines_reversed(
|
|
138
|
+
filepath: Path, start_offset: Optional[int] = None, chunk_size: int = 8192
|
|
139
|
+
) -> Generator[Tuple[bytes, int], None, None]:
|
|
140
|
+
"""
|
|
141
|
+
A generator that yields lines from a file in reverse order, along with the byte
|
|
142
|
+
offset of the start of each line. This is memory-efficient for large files.
|
|
143
|
+
"""
|
|
144
|
+
with open(filepath, "rb") as f:
|
|
145
|
+
f.seek(0, os.SEEK_END)
|
|
146
|
+
file_size = f.tell()
|
|
147
|
+
cursor = file_size
|
|
148
|
+
|
|
149
|
+
# If a start_offset is provided, optimize by starting the read
|
|
150
|
+
# from a more specific location instead of the end of the file.
|
|
151
|
+
if start_offset is not None and start_offset < file_size:
|
|
152
|
+
# To get the full content of the line that straddles the offset,
|
|
153
|
+
# we need to find its end (the next newline character).
|
|
154
|
+
f.seek(start_offset)
|
|
155
|
+
chunk = f.read(chunk_size)
|
|
156
|
+
newline_pos = chunk.find(b"\n")
|
|
157
|
+
if newline_pos != -1:
|
|
158
|
+
# Found the end of the line. The cursor for reverse reading
|
|
159
|
+
# should start from this point to include the full line.
|
|
160
|
+
cursor = start_offset + newline_pos + 1
|
|
161
|
+
else:
|
|
162
|
+
# No newline found, which means the rest of the file is one line.
|
|
163
|
+
# The default cursor pointing to file_size is correct.
|
|
164
|
+
pass
|
|
165
|
+
|
|
166
|
+
buffer = b""
|
|
167
|
+
|
|
168
|
+
while cursor > 0:
|
|
169
|
+
seek_pos = max(0, cursor - chunk_size)
|
|
170
|
+
amount_to_read = cursor - seek_pos
|
|
171
|
+
f.seek(seek_pos)
|
|
172
|
+
chunk = f.read(amount_to_read)
|
|
173
|
+
cursor = seek_pos
|
|
174
|
+
|
|
175
|
+
buffer = chunk + buffer
|
|
176
|
+
|
|
177
|
+
while b"\n" in buffer:
|
|
178
|
+
newline_pos = buffer.rfind(b"\n")
|
|
179
|
+
line = buffer[newline_pos + 1 :]
|
|
180
|
+
line_start_offset = cursor + newline_pos + 1
|
|
181
|
+
|
|
182
|
+
# Skip lines that start at or after the start_offset
|
|
183
|
+
if start_offset is None or line_start_offset < start_offset:
|
|
184
|
+
yield line, line_start_offset
|
|
185
|
+
|
|
186
|
+
buffer = buffer[:newline_pos]
|
|
187
|
+
|
|
188
|
+
# The remaining buffer is the first line of the file.
|
|
189
|
+
# Only yield it if we're not using start_offset or if it starts before start_offset
|
|
190
|
+
if buffer and (start_offset is None or 0 < start_offset):
|
|
191
|
+
yield buffer, 0
|
|
192
|
+
|
|
97
193
|
def write_logs(
|
|
98
194
|
self,
|
|
99
195
|
project: ProjectModel,
|
|
@@ -148,3 +244,17 @@ class FileLogStorage(LogStorage):
|
|
|
148
244
|
log_source=LogEventSource.STDOUT,
|
|
149
245
|
message=runner_log_event.message.decode(errors="replace"),
|
|
150
246
|
)
|
|
247
|
+
|
|
248
|
+
def _next_token(self, request: PollLogsRequest) -> Optional[int]:
|
|
249
|
+
next_token = request.next_token
|
|
250
|
+
if next_token is None:
|
|
251
|
+
return None
|
|
252
|
+
try:
|
|
253
|
+
value = int(next_token)
|
|
254
|
+
if value < 0:
|
|
255
|
+
raise ValueError("Offset must be non-negative")
|
|
256
|
+
return value
|
|
257
|
+
except (ValueError, TypeError):
|
|
258
|
+
raise ServerClientError(
|
|
259
|
+
f"Invalid next_token: {next_token}. Must be a non-negative integer."
|
|
260
|
+
)
|
|
@@ -2,13 +2,13 @@ from typing import List, Literal, Optional, Tuple, Union
|
|
|
2
2
|
|
|
3
3
|
import gpuhunt
|
|
4
4
|
|
|
5
|
-
from dstack._internal.core.backends import
|
|
5
|
+
from dstack._internal.core.backends.base.backend import Backend
|
|
6
|
+
from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
|
|
7
|
+
from dstack._internal.core.backends.features import (
|
|
6
8
|
BACKENDS_WITH_CREATE_INSTANCE_SUPPORT,
|
|
7
9
|
BACKENDS_WITH_MULTINODE_SUPPORT,
|
|
8
10
|
BACKENDS_WITH_RESERVATION_SUPPORT,
|
|
9
11
|
)
|
|
10
|
-
from dstack._internal.core.backends.base.backend import Backend
|
|
11
|
-
from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
|
|
12
12
|
from dstack._internal.core.models.backends.base import BackendType
|
|
13
13
|
from dstack._internal.core.models.instances import (
|
|
14
14
|
InstanceOfferWithAvailability,
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
import uuid
|
|
2
|
-
from datetime import timezone
|
|
3
2
|
from typing import Awaitable, Callable, List, Optional, Tuple
|
|
4
3
|
|
|
5
4
|
from sqlalchemy import delete, select, update
|
|
6
5
|
from sqlalchemy import func as safunc
|
|
7
6
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
8
|
-
from sqlalchemy.orm import joinedload
|
|
7
|
+
from sqlalchemy.orm import QueryableAttribute, joinedload, load_only
|
|
9
8
|
|
|
10
9
|
from dstack._internal.core.backends.configurators import get_configurator
|
|
11
10
|
from dstack._internal.core.backends.dstack.models import (
|
|
@@ -54,13 +53,12 @@ async def list_user_projects(
|
|
|
54
53
|
user: UserModel,
|
|
55
54
|
) -> List[Project]:
|
|
56
55
|
"""
|
|
57
|
-
Returns projects where the user is a member.
|
|
56
|
+
Returns projects where the user is a member or all projects for global admins.
|
|
58
57
|
"""
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
58
|
+
projects = await list_user_project_models(
|
|
59
|
+
session=session,
|
|
60
|
+
user=user,
|
|
61
|
+
)
|
|
64
62
|
projects = sorted(projects, key=lambda p: p.created_at)
|
|
65
63
|
return [
|
|
66
64
|
project_model_to_project(p, include_backends=False, include_members=False)
|
|
@@ -80,7 +78,7 @@ async def list_user_accessible_projects(
|
|
|
80
78
|
if user.global_role == GlobalRole.ADMIN:
|
|
81
79
|
projects = await list_project_models(session=session)
|
|
82
80
|
else:
|
|
83
|
-
member_projects = await
|
|
81
|
+
member_projects = await list_member_project_models(session=session, user=user)
|
|
84
82
|
public_projects = await list_public_non_member_project_models(session=session, user=user)
|
|
85
83
|
projects = member_projects + public_projects
|
|
86
84
|
|
|
@@ -167,7 +165,7 @@ async def delete_projects(
|
|
|
167
165
|
projects_names: List[str],
|
|
168
166
|
):
|
|
169
167
|
if user.global_role != GlobalRole.ADMIN:
|
|
170
|
-
user_projects = await
|
|
168
|
+
user_projects = await list_member_project_models(
|
|
171
169
|
session=session, user=user, include_members=True
|
|
172
170
|
)
|
|
173
171
|
user_project_names = [p.name for p in user_projects]
|
|
@@ -339,9 +337,25 @@ async def clear_project_members(
|
|
|
339
337
|
|
|
340
338
|
|
|
341
339
|
async def list_user_project_models(
|
|
340
|
+
session: AsyncSession,
|
|
341
|
+
user: UserModel,
|
|
342
|
+
only_names: bool = False,
|
|
343
|
+
) -> List[ProjectModel]:
|
|
344
|
+
load_only_attrs = []
|
|
345
|
+
if only_names:
|
|
346
|
+
load_only_attrs += [ProjectModel.id, ProjectModel.name]
|
|
347
|
+
if user.global_role == GlobalRole.ADMIN:
|
|
348
|
+
return await list_project_models(session=session, load_only_attrs=load_only_attrs)
|
|
349
|
+
return await list_member_project_models(
|
|
350
|
+
session=session, user=user, load_only_attrs=load_only_attrs
|
|
351
|
+
)
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
async def list_member_project_models(
|
|
342
355
|
session: AsyncSession,
|
|
343
356
|
user: UserModel,
|
|
344
357
|
include_members: bool = False,
|
|
358
|
+
load_only_attrs: Optional[List[QueryableAttribute]] = None,
|
|
345
359
|
) -> List[ProjectModel]:
|
|
346
360
|
"""
|
|
347
361
|
List project models for a user where they are a member.
|
|
@@ -349,6 +363,8 @@ async def list_user_project_models(
|
|
|
349
363
|
options = []
|
|
350
364
|
if include_members:
|
|
351
365
|
options.append(joinedload(ProjectModel.members))
|
|
366
|
+
if load_only_attrs:
|
|
367
|
+
options.append(load_only(*load_only_attrs))
|
|
352
368
|
res = await session.execute(
|
|
353
369
|
select(ProjectModel)
|
|
354
370
|
.where(
|
|
@@ -395,13 +411,20 @@ async def list_user_owned_project_models(
|
|
|
395
411
|
|
|
396
412
|
async def list_project_models(
|
|
397
413
|
session: AsyncSession,
|
|
414
|
+
load_only_attrs: Optional[List[QueryableAttribute]] = None,
|
|
398
415
|
) -> List[ProjectModel]:
|
|
416
|
+
options = []
|
|
417
|
+
if load_only_attrs:
|
|
418
|
+
options.append(load_only(*load_only_attrs))
|
|
399
419
|
res = await session.execute(
|
|
400
|
-
select(ProjectModel).where(ProjectModel.deleted == False)
|
|
420
|
+
select(ProjectModel).where(ProjectModel.deleted == False).options(*options)
|
|
401
421
|
)
|
|
402
422
|
return list(res.scalars().all())
|
|
403
423
|
|
|
404
424
|
|
|
425
|
+
# TODO: Do not load ProjectModel.backends and ProjectModel.members by default when getting project
|
|
426
|
+
|
|
427
|
+
|
|
405
428
|
async def get_project_model_by_name(
|
|
406
429
|
session: AsyncSession, project_name: str, ignore_case: bool = True
|
|
407
430
|
) -> Optional[ProjectModel]:
|
|
@@ -415,7 +438,6 @@ async def get_project_model_by_name(
|
|
|
415
438
|
.where(*filters)
|
|
416
439
|
.options(joinedload(ProjectModel.backends))
|
|
417
440
|
.options(joinedload(ProjectModel.members))
|
|
418
|
-
.options(joinedload(ProjectModel.default_gateway))
|
|
419
441
|
)
|
|
420
442
|
return res.unique().scalar()
|
|
421
443
|
|
|
@@ -432,7 +454,6 @@ async def get_project_model_by_name_or_error(
|
|
|
432
454
|
)
|
|
433
455
|
.options(joinedload(ProjectModel.backends))
|
|
434
456
|
.options(joinedload(ProjectModel.members))
|
|
435
|
-
.options(joinedload(ProjectModel.default_gateway))
|
|
436
457
|
)
|
|
437
458
|
return res.unique().scalar_one()
|
|
438
459
|
|
|
@@ -449,7 +470,6 @@ async def get_project_model_by_id_or_error(
|
|
|
449
470
|
)
|
|
450
471
|
.options(joinedload(ProjectModel.backends))
|
|
451
472
|
.options(joinedload(ProjectModel.members))
|
|
452
|
-
.options(joinedload(ProjectModel.default_gateway))
|
|
453
473
|
)
|
|
454
474
|
return res.unique().scalar_one()
|
|
455
475
|
|
|
@@ -537,7 +557,7 @@ def project_model_to_project(
|
|
|
537
557
|
project_id=project_model.id,
|
|
538
558
|
project_name=project_model.name,
|
|
539
559
|
owner=users.user_model_to_user(project_model.owner),
|
|
540
|
-
created_at=project_model.created_at
|
|
560
|
+
created_at=project_model.created_at,
|
|
541
561
|
backends=backends,
|
|
542
562
|
members=members,
|
|
543
563
|
is_public=project_model.is_public,
|
|
@@ -5,6 +5,9 @@ class RunMetrics:
|
|
|
5
5
|
"""Wrapper class for run-related Prometheus metrics."""
|
|
6
6
|
|
|
7
7
|
def __init__(self):
|
|
8
|
+
# submit_to_provision_duration reflects real provisioning time
|
|
9
|
+
# but does not reflect how quickly provisioning processing works
|
|
10
|
+
# since it includes scheduling time, retrying, etc.
|
|
8
11
|
self._submit_to_provision_duration = Histogram(
|
|
9
12
|
"dstack_submit_to_provision_duration_seconds",
|
|
10
13
|
"Time from when a run has been submitted and first job provisioning",
|