dstack 0.19.18__py3-none-any.whl → 0.19.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/services/configurators/fleet.py +99 -1
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/core/compatibility/runs.py +12 -1
- dstack/_internal/core/compatibility/volumes.py +2 -0
- dstack/_internal/core/models/common.py +38 -2
- dstack/_internal/core/models/configurations.py +9 -1
- dstack/_internal/core/models/fleets.py +2 -1
- dstack/_internal/core/models/profiles.py +8 -5
- dstack/_internal/core/models/resources.py +15 -8
- dstack/_internal/core/models/runs.py +41 -138
- dstack/_internal/core/models/volumes.py +14 -0
- dstack/_internal/core/services/diff.py +30 -10
- dstack/_internal/core/services/ssh/attach.py +2 -0
- dstack/_internal/server/app.py +17 -9
- dstack/_internal/server/background/__init__.py +5 -3
- dstack/_internal/server/background/tasks/process_gateways.py +46 -28
- dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +2 -0
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
- dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/backends.py +23 -16
- dstack/_internal/server/routers/files.py +7 -6
- dstack/_internal/server/routers/fleets.py +47 -36
- dstack/_internal/server/routers/gateways.py +27 -18
- dstack/_internal/server/routers/instances.py +18 -13
- dstack/_internal/server/routers/logs.py +7 -3
- dstack/_internal/server/routers/metrics.py +14 -8
- dstack/_internal/server/routers/projects.py +33 -22
- dstack/_internal/server/routers/repos.py +7 -6
- dstack/_internal/server/routers/runs.py +49 -28
- dstack/_internal/server/routers/secrets.py +20 -15
- dstack/_internal/server/routers/server.py +7 -4
- dstack/_internal/server/routers/users.py +22 -19
- dstack/_internal/server/routers/volumes.py +34 -25
- dstack/_internal/server/schemas/logs.py +2 -2
- dstack/_internal/server/schemas/runs.py +17 -5
- dstack/_internal/server/services/fleets.py +354 -72
- dstack/_internal/server/services/gateways/__init__.py +13 -4
- dstack/_internal/server/services/gateways/client.py +5 -3
- dstack/_internal/server/services/instances.py +8 -0
- dstack/_internal/server/services/jobs/__init__.py +45 -0
- dstack/_internal/server/services/jobs/configurators/base.py +7 -0
- dstack/_internal/server/services/locking.py +3 -1
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/__init__.py +15 -2
- dstack/_internal/server/services/logs/aws.py +2 -4
- dstack/_internal/server/services/logs/filelog.py +33 -27
- dstack/_internal/server/services/logs/gcp.py +3 -5
- dstack/_internal/server/services/proxy/repo.py +4 -1
- dstack/_internal/server/services/runs.py +115 -32
- dstack/_internal/server/services/services/__init__.py +2 -1
- dstack/_internal/server/services/users.py +3 -1
- dstack/_internal/server/services/volumes.py +13 -0
- dstack/_internal/server/settings.py +7 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js → main-64f8273740c4b52c18f5.js} +6 -6
- dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js.map → main-64f8273740c4b52c18f5.js.map} +1 -1
- dstack/_internal/server/testing/common.py +41 -5
- dstack/_internal/server/utils/routers.py +31 -8
- dstack/_internal/utils/json_utils.py +54 -0
- dstack/api/_public/runs.py +13 -2
- dstack/api/server/_runs.py +12 -2
- dstack/version.py +1 -1
- {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/METADATA +7 -5
- {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/RECORD +69 -66
- {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/WHEEL +0 -0
- {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/licenses/LICENSE.md +0 -0
Selected hunks from the diff:

@@ -134,6 +134,8 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
     finished_at = None
     if job_model.status.is_finished():
         finished_at = last_processed_at
+    status_message = _get_job_status_message(job_model)
+    error = _get_job_error(job_model)
     return JobSubmission(
         id=job_model.id,
         submission_num=job_model.submission_num,
@@ -143,11 +145,13 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
         finished_at=finished_at,
         inactivity_secs=job_model.inactivity_secs,
         status=job_model.status,
+        status_message=status_message,
         termination_reason=job_model.termination_reason,
         termination_reason_message=job_model.termination_reason_message,
         exit_status=job_model.exit_status,
         job_provisioning_data=job_provisioning_data,
         job_runtime_data=get_job_runtime_data(job_model),
+        error=error,
     )
@@ -289,6 +293,19 @@ async def process_terminating_job(
         # so that stuck volumes don't prevent the instance from terminating.
         job_model.instance_id = None
         instance_model.last_job_processed_at = common.get_current_datetime()
+
+        volume_names = (
+            jrd.volume_names
+            if jrd and jrd.volume_names
+            else [va.volume.name for va in instance_model.volume_attachments]
+        )
+        if volume_names:
+            volumes = await list_project_volume_models(
+                session=session, project=instance_model.project, names=volume_names
+            )
+            for volume in volumes:
+                volume.last_job_processed_at = common.get_current_datetime()
+
         logger.info(
             "%s: instance '%s' has been released, new status is %s",
             fmt(job_model),
@@ -693,3 +710,31 @@ def _get_job_mount_point_attached_volume(
             continue
         return volume
     raise ServerClientError("Failed to find an eligible volume for the mount point")
+
+
+def _get_job_status_message(job_model: JobModel) -> str:
+    if job_model.status == JobStatus.DONE:
+        return "exited (0)"
+    elif job_model.status == JobStatus.FAILED:
+        if job_model.termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
+            return f"exited ({job_model.exit_status})"
+        elif (
+            job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
+        ):
+            return "no offers"
+        elif job_model.termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
+            return "interrupted"
+        else:
+            return "error"
+    elif job_model.status == JobStatus.TERMINATED:
+        if job_model.termination_reason == JobTerminationReason.TERMINATED_BY_USER:
+            return "stopped"
+        elif job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER:
+            return "aborted"
+    return job_model.status.value
+
+
+def _get_job_error(job_model: JobModel) -> Optional[str]:
+    if job_model.termination_reason is None:
+        return None
+    return job_model.termination_reason.to_error()
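To illustrate the mapping introduced above, here is a small self-contained mirror of `_get_job_status_message` that uses plain strings instead of dstack's `JobStatus`/`JobTerminationReason` enums (the function and value names below are illustrative stand-ins, not dstack code):

from typing import Optional


def status_message(status: str, termination_reason: Optional[str], exit_status: Optional[int]) -> str:
    # Mirrors the branches in the hunk above, with lowercase string stand-ins for the enums.
    if status == "done":
        return "exited (0)"
    if status == "failed":
        if termination_reason == "container_exited_with_error":
            return f"exited ({exit_status})"
        if termination_reason == "failed_to_start_due_to_no_capacity":
            return "no offers"
        if termination_reason == "interrupted_by_no_capacity":
            return "interrupted"
        return "error"
    if status == "terminated":
        if termination_reason == "terminated_by_user":
            return "stopped"
        if termination_reason == "aborted_by_user":
            return "aborted"
    return status  # e.g. "running", "pulling", "provisioning"


assert status_message("done", None, 0) == "exited (0)"
assert status_message("failed", "container_exited_with_error", 137) == "exited (137)"
assert status_message("failed", "failed_to_start_due_to_no_capacity", None) == "no offers"
assert status_message("terminated", "terminated_by_user", None) == "stopped"
assert status_message("running", None, None) == "running"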
@@ -15,6 +15,7 @@ from dstack._internal.core.models.configurations import (
     PortMapping,
     PythonVersion,
     RunConfigurationType,
+    ServiceConfiguration,
 )
 from dstack._internal.core.models.profiles import (
     DEFAULT_STOP_DURATION,
@@ -153,6 +154,7 @@ class JobConfigurator(ABC):
             repo_data=self.run_spec.repo_data,
             repo_code_hash=self.run_spec.repo_code_hash,
             file_archives=self.run_spec.file_archives,
+            service_port=self._service_port(),
         )
         return job_spec

@@ -306,6 +308,11 @@ class JobConfigurator(ABC):
             )
         return self._job_ssh_key

+    def _service_port(self) -> Optional[int]:
+        if isinstance(self.run_spec.configuration, ServiceConfiguration):
+            return self.run_spec.configuration.port.container_port
+        return None
+

 def interpolate_job_volumes(
     run_volumes: List[Union[MountPoint, str]],
@@ -172,7 +172,7 @@ async def _wait_to_lock_many(
     The keys must be sorted to prevent deadlock.
     """
     left_to_lock = keys.copy()
-    while left_to_lock:
+    while True:
         async with lock:
             locked_now_num = 0
             for key in left_to_lock:
@@ -182,4 +182,6 @@ async def _wait_to_lock_many(
                 locked.add(key)
                 locked_now_num += 1
             left_to_lock = left_to_lock[locked_now_num:]
+            if not left_to_lock:
+                return
         await asyncio.sleep(delay)
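The `_wait_to_lock_many` fix makes the loop return as soon as every key is acquired instead of sleeping one extra cycle. A minimal, self-contained sketch of the same ordered-acquisition pattern (simplified; dstack's real implementation differs in details such as how the shared `locked` set is managed):

import asyncio
from typing import Hashable, List, Set


async def wait_to_lock_many(
    lock: asyncio.Lock, locked: Set[Hashable], keys: List[Hashable], delay: float = 0.1
) -> None:
    """Lock all keys. Keys must be sorted by the caller so that concurrent
    lockers acquire them in the same order and cannot deadlock each other."""
    left_to_lock = keys.copy()
    while True:
        async with lock:
            locked_now_num = 0
            for key in left_to_lock:
                if key in locked:
                    # Another coroutine holds this key; keep the prefix we got and retry later.
                    break
                locked.add(key)
                locked_now_num += 1
            left_to_lock = left_to_lock[locked_now_num:]
            if not left_to_lock:
                return  # everything locked -- no extra sleep (the 0.19.19 fix)
        await asyncio.sleep(delay)


async def main() -> None:
    lock, locked = asyncio.Lock(), set()
    await wait_to_lock_many(lock, locked, sorted(["a", "b", "c"]))
    print(locked)  # {'a', 'b', 'c'}


asyncio.run(main())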
@@ -1,12 +1,14 @@
 from typing import Union

-from dstack._internal.server.models import JobModel, RunModel
+from dstack._internal.server.models import GatewayModel, JobModel, RunModel


-def fmt(model: Union[RunModel, JobModel]) -> str:
+def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
     """Consistent string representation of a model for logging."""
     if isinstance(model, RunModel):
         return f"run({model.id.hex[:6]}){model.run_name}"
     if isinstance(model, JobModel):
         return f"job({model.id.hex[:6]}){model.job_name}"
+    if isinstance(model, GatewayModel):
+        return f"gateway({model.id.hex[:6]}){model.name}"
     return str(model)
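The new gateway branch produces the same compact log prefix style already used for runs and jobs. A quick illustration with a stand-in object (not a real `GatewayModel`):

import uuid
from types import SimpleNamespace

# Stand-in carrying just the attributes fmt() reads from a GatewayModel.
gateway = SimpleNamespace(id=uuid.UUID("1a2b3c00-0000-0000-0000-000000000000"), name="my-gateway")
print(f"gateway({gateway.id.hex[:6]}){gateway.name}")  # gateway(1a2b3c)my-gateway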
@@ -8,7 +8,11 @@ from dstack._internal.server.models import ProjectModel
 from dstack._internal.server.schemas.logs import PollLogsRequest
 from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
-from dstack._internal.server.services.logs.base import LogStorage, LogStorageError
+from dstack._internal.server.services.logs.base import (
+    LogStorage,
+    LogStorageError,
+    b64encode_raw_message,
+)
 from dstack._internal.server.services.logs.filelog import FileLogStorage
 from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
 from dstack._internal.utils.common import run_async
@@ -75,4 +79,13 @@ def write_logs(


 async def poll_logs_async(project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
+    job_submission_logs = await run_async(
+        get_log_storage().poll_logs, project=project, request=request
+    )
+    # Logs are stored in plaintext but transmitted in base64 for API/CLI backward compatibility.
+    # Old logs stored in base64 are encoded twice for transmission and shown as base64 in CLI/UI.
+    # We live with that.
+    # TODO: Drop base64 encoding in 0.20.
+    for log_event in job_submission_logs.logs:
+        log_event.message = b64encode_raw_message(log_event.message.encode())
+    return job_submission_logs
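The comment in the hunk explains the trade-off: storage is now plaintext, but the polling API still returns base64 so existing clients keep working, and logs written as base64 by older versions end up encoded twice. A small sketch of that round trip, assuming `b64encode_raw_message` is essentially standard base64 encoding of the raw bytes (an assumption about the helper, which is not shown in this diff):

import base64


def b64encode_raw_message(message: bytes) -> str:
    # Assumption: dstack's helper is a thin wrapper like this.
    return base64.b64encode(message).decode()


# New-style record: stored as plaintext, encoded once for the API response.
stored_plaintext = "hello world\n"
assert base64.b64decode(b64encode_raw_message(stored_plaintext.encode())).decode() == stored_plaintext

# Old-style record: already stored as base64, so the API response is double-encoded
# and a client that decodes once still sees base64 text (the accepted trade-off).
stored_legacy = base64.b64encode(b"hello world\n").decode()
once_decoded = base64.b64decode(b64encode_raw_message(stored_legacy.encode())).decode()
assert once_decoded == stored_legacy  # still base64, not "hello world\n"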
@@ -17,7 +17,6 @@ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
     LogStorageError,
-    b64encode_raw_message,
     datetime_to_unix_time_ms,
     unix_time_ms_to_datetime,
 )
@@ -238,8 +237,7 @@ class CloudWatchLogStorage(LogStorage):
                 skipped_future_events += 1
                 continue
             cw_event = self._runner_log_event_to_cloudwatch_event(event)
-
-            message_size = len(cw_event["message"]) + self.MESSAGE_OVERHEAD_SIZE
+            message_size = len(event.message) + self.MESSAGE_OVERHEAD_SIZE
             if message_size > self.MESSAGE_MAX_SIZE:
                 # we should never hit this limit, as we use `io.Copy` to copy from pty to logs,
                 # which under the hood uses 32KiB buffer, see runner/internal/executor/executor.go,
@@ -271,7 +269,7 @@ class CloudWatchLogStorage(LogStorage):
     ) -> _CloudWatchLogEvent:
         return {
             "timestamp": runner_log_event.timestamp,
-            "message": b64encode_raw_message(runner_log_event.message),
+            "message": runner_log_event.message.decode(errors="replace"),
         }

     @contextmanager
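Sizing the event against `len(event.message)` (the raw bytes) rather than the decoded string matters for non-ASCII output, since CloudWatch counts message size in UTF-8 bytes plus a fixed per-event overhead. A quick check of why byte length is the conservative bound:

message = "héllo ⚠".encode()               # what the runner sends: raw bytes
decoded = message.decode(errors="replace")  # what ends up in the CloudWatch event

print(len(message), len(decoded))           # 10 bytes vs 7 characters
assert len(message) >= len(decoded)         # byte length is always >= decoded character count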
@@ -2,6 +2,7 @@ from pathlib import Path
 from typing import List, Union
 from uuid import UUID

+from dstack._internal.core.errors import ServerClientError
 from dstack._internal.core.models.logs import (
     JobSubmissionLogs,
     LogEvent,
@@ -14,8 +15,6 @@ from dstack._internal.server.schemas.logs import PollLogsRequest
 from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
-    LogStorageError,
-    b64encode_raw_message,
     unix_time_ms_to_datetime,
 )

@@ -30,9 +29,6 @@ class FileLogStorage(LogStorage):
         self.root = Path(root)

     def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
-        if request.descending:
-            raise LogStorageError("descending: true is not supported")
-
         log_producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
         log_file_path = self._get_log_file_path(
             project_name=project.name,
@@ -46,11 +42,11 @@ class FileLogStorage(LogStorage):
         try:
             start_line = int(request.next_token)
             if start_line < 0:
-                raise LogStorageError(
+                raise ServerClientError(
                     f"Invalid next_token: {request.next_token}. Must be a non-negative integer."
                 )
         except ValueError:
-            raise LogStorageError(
+            raise ServerClientError(
                 f"Invalid next_token: {request.next_token}. Must be a valid integer."
             )

@@ -60,31 +56,41 @@ class FileLogStorage(LogStorage):
         try:
             with open(log_file_path) as f:
+                # Skip to start_line if needed
+                for _ in range(start_line):
+                    if f.readline() == "":
+                        # File is shorter than start_line
+                        return JobSubmissionLogs(logs=logs, next_token=next_token)
                     current_line += 1
-                    continue
+                # Read lines one by one
+                while True:
+                    line = f.readline()
+                    if line == "":  # EOF
+                        break
+                    current_line += 1
+                    try:
+                        log_event = LogEvent.__response__.parse_raw(line)
+                    except Exception:
+                        # Skip malformed lines
+                        continue
+                    if request.start_time and log_event.timestamp <= request.start_time:
+                        continue
+                    if request.end_time is not None and log_event.timestamp >= request.end_time:
+                        break
+                    logs.append(log_event)
-                # Only set next_token if there are more lines to read
-                if current_line < len(lines):
-                    next_token = str(current_line)
-                break
+                    if len(logs) >= request.limit:
+                        # Check if there are more lines to read
+                        if f.readline() != "":
+                            next_token = str(current_line)
+                        break
+        except FileNotFoundError:
+            pass

         return JobSubmissionLogs(logs=logs, next_token=next_token)
@@ -140,5 +146,5 @@ class FileLogStorage(LogStorage):
         return LogEvent(
             timestamp=unix_time_ms_to_datetime(runner_log_event.timestamp),
             log_source=LogEventSource.STDOUT,
-            message=b64encode_raw_message(runner_log_event.message),
+            message=runner_log_event.message.decode(errors="replace"),
         )
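The rewritten `FileLogStorage.poll_logs` reads the file incrementally and returns `next_token` as the line number to resume from, and only when more lines remain. A file-only sketch of how a caller could page through a log with that contract (hypothetical helper, not dstack's API client):

from pathlib import Path
from typing import List, Optional, Tuple


def poll_lines(path: Path, start_line: int, limit: int) -> Tuple[List[str], Optional[str]]:
    """Return up to `limit` lines starting at `start_line`, plus a resume token."""
    lines: List[str] = []
    next_token: Optional[str] = None
    current_line = 0
    with open(path) as f:
        for _ in range(start_line):          # skip already-consumed lines
            if f.readline() == "":
                return lines, next_token     # file is shorter than start_line
            current_line += 1
        while True:
            line = f.readline()
            if line == "":                   # EOF
                break
            current_line += 1
            lines.append(line.rstrip("\n"))
            if len(lines) >= limit:
                if f.readline() != "":       # more data exists -> hand out a token
                    next_token = str(current_line)
                break
    return lines, next_token


# Usage: keep polling until no token is returned.
path = Path("example.log")
path.write_text("".join(f"line {i}\n" for i in range(7)))
token: Optional[str] = None
while True:
    batch, token = poll_lines(path, int(token or 0), limit=3)
    print(batch)
    if token is None:
        break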
@@ -14,7 +14,6 @@ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
     LogStorageError,
-    b64encode_raw_message,
     unix_time_ms_to_datetime,
 )
 from dstack._internal.utils.common import batched
@@ -137,15 +136,14 @@ class GCPLogStorage(LogStorage):
         with self.logger.batch() as batcher:
             for batch in batched(logs, self.MAX_BATCH_SIZE):
                 for log in batch:
-                    message = b64encode_raw_message(log.message)
+                    message = log.message.decode(errors="replace")
                     timestamp = unix_time_ms_to_datetime(log.timestamp)
-
-                    if len(message) > self.MAX_RUNNER_MESSAGE_SIZE:
+                    if len(log.message) > self.MAX_RUNNER_MESSAGE_SIZE:
                         logger.error(
                             "Stream %s: skipping event at %s, message exceeds max size: %d > %d",
                             stream_name,
                             timestamp.isoformat(),
-                            len(message),
+                            len(log.message),
                             self.MAX_RUNNER_MESSAGE_SIZE,
                         )
                         continue
@@ -12,10 +12,12 @@ from dstack._internal.core.models.configurations import ServiceConfiguration
 from dstack._internal.core.models.instances import RemoteConnectionInfo, SSHConnectionParams
 from dstack._internal.core.models.runs import (
     JobProvisioningData,
+    JobSpec,
     JobStatus,
     RunSpec,
     RunStatus,
     ServiceSpec,
+    get_service_port,
 )
 from dstack._internal.core.models.services import AnyModel
 from dstack._internal.proxy.lib.models import (
@@ -97,9 +99,10 @@ class ServerProxyRepo(BaseProxyRepo):
             if rci.ssh_proxy is not None:
                 ssh_head_proxy = rci.ssh_proxy
                 ssh_head_proxy_private_key = get_or_error(rci.ssh_proxy_keys)[0].private
+        job_spec: JobSpec = JobSpec.__response__.parse_raw(job.job_spec_data)
         replica = Replica(
             id=job.id.hex,
-            app_port=run_spec.configuration.port.container_port,
+            app_port=get_service_port(job_spec, run_spec.configuration),
             ssh_destination=ssh_destination,
             ssh_port=ssh_port,
             ssh_proxy=ssh_proxy,
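`get_service_port` itself and the new `JobSpec.service_port` field are not shown in this diff; given that the configurator records the service's container port on the job spec, the proxy presumably prefers the per-job value and falls back to the run configuration for specs created before the field existed. A speculative sketch of that fallback (hypothetical stand-in names, not the actual dstack helper):

from typing import Optional


def get_service_port(job_spec_service_port: Optional[int], configuration_container_port: int) -> int:
    # Assumed behavior: prefer the port recorded on the job spec; fall back to the
    # service configuration for older job specs without a service_port.
    if job_spec_service_port is not None:
        return job_spec_service_port
    return configuration_container_port


assert get_service_port(8080, 80) == 8080
assert get_service_port(None, 80) == 80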
@@ -24,6 +24,7 @@ from dstack._internal.core.models.instances import (
 )
 from dstack._internal.core.models.profiles import (
     CreationPolicy,
+    RetryEvent,
 )
 from dstack._internal.core.models.repos.virtual import DEFAULT_VIRTUAL_REPO_ID, VirtualRunRepoData
 from dstack._internal.core.models.runs import (
@@ -105,6 +106,8 @@ async def list_user_runs(
     repo_id: Optional[str],
     username: Optional[str],
     only_active: bool,
+    include_jobs: bool,
+    job_submissions_limit: Optional[int],
     prev_submitted_at: Optional[datetime],
     prev_run_id: Optional[uuid.UUID],
     limit: int,
@@ -148,7 +151,14 @@ async def list_user_runs(
     runs = []
     for r in run_models:
         try:
-            runs.append(run_model_to_run(r, return_in_api=True))
+            runs.append(
+                run_model_to_run(
+                    r,
+                    return_in_api=True,
+                    include_jobs=include_jobs,
+                    job_submissions_limit=job_submissions_limit,
+                )
+            )
         except pydantic.ValidationError:
             pass
     if len(run_models) > len(runs):
@@ -652,51 +662,33 @@ async def delete_runs(

 def run_model_to_run(
     run_model: RunModel,
-    include_job_submissions: bool = True,
+    include_jobs: bool = True,
+    job_submissions_limit: Optional[int] = None,
     return_in_api: bool = False,
     include_sensitive: bool = False,
 ) -> Run:
     jobs: List[Job] = []
-    submissions = []
-    job_model = None
-    for job_model in job_submissions:
-        if include_job_submissions:
-            job_submission = job_model_to_job_submission(job_model)
-            if return_in_api:
-                # Set default non-None values for 0.18 backward-compatibility
-                # Remove in 0.19
-                if job_submission.job_provisioning_data is not None:
-                    if job_submission.job_provisioning_data.hostname is None:
-                        job_submission.job_provisioning_data.hostname = ""
-                    if job_submission.job_provisioning_data.ssh_port is None:
-                        job_submission.job_provisioning_data.ssh_port = 22
-            submissions.append(job_submission)
-    if job_model is not None:
-        # Use the spec from the latest submission. Submissions can have different specs
-        job_spec = JobSpec.__response__.parse_raw(job_model.job_spec_data)
-        if not include_sensitive:
-            _remove_job_spec_sensitive_info(job_spec)
-        jobs.append(Job(job_spec=job_spec, job_submissions=submissions))
+    if include_jobs:
+        jobs = _get_run_jobs_with_submissions(
+            run_model=run_model,
+            job_submissions_limit=job_submissions_limit,
+            return_in_api=return_in_api,
+            include_sensitive=include_sensitive,
+        )

     run_spec = RunSpec.__response__.parse_raw(run_model.run_spec)

     latest_job_submission = None
+    if len(jobs) > 0 and len(jobs[0].job_submissions) > 0:
         # TODO(egor-s): does it make sense with replicas and multi-node?
+        latest_job_submission = jobs[0].job_submissions[-1]

     service_spec = None
     if run_model.service_spec is not None:
         service_spec = ServiceSpec.__response__.parse_raw(run_model.service_spec)

+    status_message = _get_run_status_message(run_model)
+    error = _get_run_error(run_model)
     run = Run(
         id=run_model.id,
         project_name=run_model.project.name,
@@ -704,18 +696,107 @@ def run_model_to_run(
         submitted_at=run_model.submitted_at.replace(tzinfo=timezone.utc),
         last_processed_at=run_model.last_processed_at.replace(tzinfo=timezone.utc),
         status=run_model.status,
+        status_message=status_message,
         termination_reason=run_model.termination_reason,
         run_spec=run_spec,
         jobs=jobs,
         latest_job_submission=latest_job_submission,
         service=service_spec,
         deployment_num=run_model.deployment_num,
+        error=error,
         deleted=run_model.deleted,
     )
     run.cost = _get_run_cost(run)
     return run


+def _get_run_jobs_with_submissions(
+    run_model: RunModel,
+    job_submissions_limit: Optional[int],
+    return_in_api: bool = False,
+    include_sensitive: bool = False,
+) -> List[Job]:
+    jobs: List[Job] = []
+    run_jobs = sorted(run_model.jobs, key=lambda j: (j.replica_num, j.job_num, j.submission_num))
+    for replica_num, replica_submissions in itertools.groupby(
+        run_jobs, key=lambda j: j.replica_num
+    ):
+        for job_num, job_models in itertools.groupby(replica_submissions, key=lambda j: j.job_num):
+            submissions = []
+            job_model = None
+            if job_submissions_limit is not None:
+                if job_submissions_limit == 0:
+                    # Take latest job submission to return its job_spec
+                    job_models = list(job_models)[-1:]
+                else:
+                    job_models = list(job_models)[-job_submissions_limit:]
+            for job_model in job_models:
+                if job_submissions_limit != 0:
+                    job_submission = job_model_to_job_submission(job_model)
+                    if return_in_api:
+                        # Set default non-None values for 0.18 backward-compatibility
+                        # Remove in 0.19
+                        if job_submission.job_provisioning_data is not None:
+                            if job_submission.job_provisioning_data.hostname is None:
+                                job_submission.job_provisioning_data.hostname = ""
+                            if job_submission.job_provisioning_data.ssh_port is None:
+                                job_submission.job_provisioning_data.ssh_port = 22
+                    submissions.append(job_submission)
+            if job_model is not None:
+                # Use the spec from the latest submission. Submissions can have different specs
+                job_spec = JobSpec.__response__.parse_raw(job_model.job_spec_data)
+                if not include_sensitive:
+                    _remove_job_spec_sensitive_info(job_spec)
+                jobs.append(Job(job_spec=job_spec, job_submissions=submissions))
+    return jobs
+
+
+def _get_run_status_message(run_model: RunModel) -> str:
+    if len(run_model.jobs) == 0:
+        return run_model.status.value
+
+    sorted_job_models = sorted(
+        run_model.jobs, key=lambda j: (j.replica_num, j.job_num, j.submission_num)
+    )
+    job_models_grouped_by_job = list(
+        list(jm)
+        for _, jm in itertools.groupby(sorted_job_models, key=lambda j: (j.replica_num, j.job_num))
+    )
+
+    if all(job_models[-1].status == JobStatus.PULLING for job_models in job_models_grouped_by_job):
+        # Show `pulling` if last job submission of all jobs is pulling
+        return "pulling"
+
+    if run_model.status in [RunStatus.SUBMITTED, RunStatus.PENDING]:
+        # Show `retrying` if any job caused the run to retry
+        for job_models in job_models_grouped_by_job:
+            last_job_spec = JobSpec.__response__.parse_raw(job_models[-1].job_spec_data)
+            retry_on_events = last_job_spec.retry.on_events if last_job_spec.retry else []
+            last_job_termination_reason = _get_last_job_termination_reason(job_models)
+            if (
+                last_job_termination_reason
+                == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
+                and RetryEvent.NO_CAPACITY in retry_on_events
+            ):
+                # TODO: Show `retrying` for other retry events
+                return "retrying"
+
+    return run_model.status.value
+
+
+def _get_last_job_termination_reason(job_models: List[JobModel]) -> Optional[JobTerminationReason]:
+    for job_model in reversed(job_models):
+        if job_model.termination_reason is not None:
+            return job_model.termination_reason
+    return None
+
+
+def _get_run_error(run_model: RunModel) -> Optional[str]:
+    if run_model.termination_reason is None:
+        return None
+    return run_model.termination_reason.to_error()
+
+
 async def _get_pool_offers(
     session: AsyncSession,
     project: ProjectModel,
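Grounded in `_get_run_jobs_with_submissions` above: `job_submissions_limit=None` keeps all submissions, a positive limit keeps the last N, and `0` still reads the latest submission so its `job_spec` can be returned, but emits no submissions. The slicing in isolation:

submission_nums = [0, 1, 2, 3]


def pick(limit):
    # Mirrors the slicing applied to one job's submissions in the function above.
    if limit is None:
        return submission_nums
    if limit == 0:
        return submission_nums[-1:]   # latest kept only so its job_spec can be read
    return submission_nums[-limit:]


assert pick(None) == [0, 1, 2, 3]
assert pick(2) == [2, 3]
assert pick(0) == [3]  # and the loop above skips appending submissions when limit == 0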
@@ -914,6 +995,8 @@ _TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS = {
     "replicas",
     "scaling",
     # rolling deployment
+    # NOTE: keep this list in sync with the "Rolling deployment" section in services.md
+    "port",
     "resources",
     "volumes",
     "docker",
@@ -22,7 +22,7 @@ from dstack._internal.core.errors import (
 from dstack._internal.core.models.configurations import SERVICE_HTTPS_DEFAULT, ServiceConfiguration
 from dstack._internal.core.models.gateways import GatewayConfiguration, GatewayStatus
 from dstack._internal.core.models.instances import SSHConnectionParams
-from dstack._internal.core.models.runs import Run, RunSpec, ServiceModelSpec, ServiceSpec
+from dstack._internal.core.models.runs import JobSpec, Run, RunSpec, ServiceModelSpec, ServiceSpec
 from dstack._internal.server import settings
 from dstack._internal.server.models import GatewayModel, JobModel, ProjectModel, RunModel
 from dstack._internal.server.services.gateways import (
@@ -179,6 +179,7 @@ async def register_replica(
         async with conn.client() as client:
             await client.register_replica(
                 run=run,
+                job_spec=JobSpec.__response__.parse_raw(job_model.job_spec_data),
                 job_submission=job_submission,
                 ssh_head_proxy=ssh_head_proxy,
                 ssh_head_proxy_private_key=ssh_head_proxy_private_key,
@@ -44,7 +44,9 @@ async def list_users_for_user(
     session: AsyncSession,
     user: UserModel,
 ) -> List[User]:
-    return await list_all_users(session=session)
+    if user.global_role == GlobalRole.ADMIN:
+        return await list_all_users(session=session)
+    return [user_model_to_user(user)]


 async def list_all_users(
@@ -401,6 +401,19 @@ def _validate_volume_configuration(configuration: VolumeConfiguration):
     if configuration.name is not None:
         validate_dstack_resource_name(configuration.name)

+    if configuration.volume_id is not None and configuration.auto_cleanup_duration is not None:
+        if (
+            isinstance(configuration.auto_cleanup_duration, int)
+            and configuration.auto_cleanup_duration > 0
+        ) or (
+            isinstance(configuration.auto_cleanup_duration, str)
+            and configuration.auto_cleanup_duration not in ("off", "-1")
+        ):
+            raise ServerClientError(
+                "External volumes (with volume_id) do not support auto_cleanup_duration. "
+                "Auto-cleanup only works for volumes created and managed by dstack."
+            )
+

 async def _delete_volume(session: AsyncSession, project: ProjectModel, volume_model: VolumeModel):
     volume = volume_model_to_volume(volume_model)
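The new `last_job_processed_at` column (see the volume migration and models.py changes above) and the `auto_cleanup_duration` validation support the new `process_idle_volumes` background task, whose code is not included in the hunks shown here. A rough sketch of the check such a task presumably performs; the names, duration parsing, and logic below are assumptions, not the actual dstack implementation:

from datetime import datetime, timedelta, timezone
from typing import Optional, Union


def parse_duration(value: Union[int, str, None]) -> Optional[timedelta]:
    # Assumed semantics: int seconds; "off"/"-1" (or unset) disables auto-cleanup.
    if value is None or value in ("off", "-1"):
        return None
    if isinstance(value, int):
        return timedelta(seconds=value) if value > 0 else None
    return timedelta(seconds=int(value))


def should_delete_idle_volume(
    last_job_processed_at: Optional[datetime],
    auto_cleanup_duration: Union[int, str, None],
    now: Optional[datetime] = None,
) -> bool:
    ttl = parse_duration(auto_cleanup_duration)
    if ttl is None or last_job_processed_at is None:
        return False
    now = now or datetime.now(timezone.utc)
    return now - last_job_processed_at >= ttl


now = datetime.now(timezone.utc)
assert should_delete_idle_volume(now - timedelta(hours=3), 3600, now)        # idle past the TTL
assert not should_delete_idle_volume(now - timedelta(minutes=5), 3600, now)  # still within the TTL
assert not should_delete_idle_volume(now - timedelta(days=7), "off", now)    # cleanup disabled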
@@ -42,6 +42,11 @@ SERVER_BACKGROUND_PROCESSING_FACTOR = int(
     os.getenv("DSTACK_SERVER_BACKGROUND_PROCESSING_FACTOR", 1)
 )

+SERVER_BACKGROUND_PROCESSING_DISABLED = (
+    os.getenv("DSTACK_SERVER_BACKGROUND_PROCESSING_DISABLED") is not None
+)
+SERVER_BACKGROUND_PROCESSING_ENABLED = not SERVER_BACKGROUND_PROCESSING_DISABLED
+
 SERVER_EXECUTOR_MAX_WORKERS = int(os.getenv("DSTACK_SERVER_EXECUTOR_MAX_WORKERS", 128))

 MAX_OFFERS_TRIED = int(os.getenv("DSTACK_SERVER_MAX_OFFERS_TRIED", 25))
@@ -113,5 +118,5 @@ SERVER_PROFILING_ENABLED = os.getenv("DSTACK_SERVER_PROFILING_ENABLED") is not None

 UPDATE_DEFAULT_PROJECT = os.getenv("DSTACK_UPDATE_DEFAULT_PROJECT") is not None
 DO_NOT_UPDATE_DEFAULT_PROJECT = os.getenv("DSTACK_DO_NOT_UPDATE_DEFAULT_PROJECT") is not None
-SKIP_GATEWAY_UPDATE = os.getenv("DSTACK_SKIP_GATEWAY_UPDATE"
-ENABLE_PROMETHEUS_METRICS = os.getenv("DSTACK_ENABLE_PROMETHEUS_METRICS"
+SKIP_GATEWAY_UPDATE = os.getenv("DSTACK_SKIP_GATEWAY_UPDATE") is not None
+ENABLE_PROMETHEUS_METRICS = os.getenv("DSTACK_ENABLE_PROMETHEUS_METRICS") is not None