dstack 0.19.18__py3-none-any.whl → 0.19.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (69)
  1. dstack/_internal/cli/services/configurators/fleet.py +99 -1
  2. dstack/_internal/cli/services/profile.py +1 -1
  3. dstack/_internal/core/compatibility/runs.py +12 -1
  4. dstack/_internal/core/compatibility/volumes.py +2 -0
  5. dstack/_internal/core/models/common.py +38 -2
  6. dstack/_internal/core/models/configurations.py +9 -1
  7. dstack/_internal/core/models/fleets.py +2 -1
  8. dstack/_internal/core/models/profiles.py +8 -5
  9. dstack/_internal/core/models/resources.py +15 -8
  10. dstack/_internal/core/models/runs.py +41 -138
  11. dstack/_internal/core/models/volumes.py +14 -0
  12. dstack/_internal/core/services/diff.py +30 -10
  13. dstack/_internal/core/services/ssh/attach.py +2 -0
  14. dstack/_internal/server/app.py +17 -9
  15. dstack/_internal/server/background/__init__.py +5 -3
  16. dstack/_internal/server/background/tasks/process_gateways.py +46 -28
  17. dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
  18. dstack/_internal/server/background/tasks/process_submitted_jobs.py +2 -0
  19. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
  20. dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
  21. dstack/_internal/server/models.py +1 -0
  22. dstack/_internal/server/routers/backends.py +23 -16
  23. dstack/_internal/server/routers/files.py +7 -6
  24. dstack/_internal/server/routers/fleets.py +47 -36
  25. dstack/_internal/server/routers/gateways.py +27 -18
  26. dstack/_internal/server/routers/instances.py +18 -13
  27. dstack/_internal/server/routers/logs.py +7 -3
  28. dstack/_internal/server/routers/metrics.py +14 -8
  29. dstack/_internal/server/routers/projects.py +33 -22
  30. dstack/_internal/server/routers/repos.py +7 -6
  31. dstack/_internal/server/routers/runs.py +49 -28
  32. dstack/_internal/server/routers/secrets.py +20 -15
  33. dstack/_internal/server/routers/server.py +7 -4
  34. dstack/_internal/server/routers/users.py +22 -19
  35. dstack/_internal/server/routers/volumes.py +34 -25
  36. dstack/_internal/server/schemas/logs.py +2 -2
  37. dstack/_internal/server/schemas/runs.py +17 -5
  38. dstack/_internal/server/services/fleets.py +354 -72
  39. dstack/_internal/server/services/gateways/__init__.py +13 -4
  40. dstack/_internal/server/services/gateways/client.py +5 -3
  41. dstack/_internal/server/services/instances.py +8 -0
  42. dstack/_internal/server/services/jobs/__init__.py +45 -0
  43. dstack/_internal/server/services/jobs/configurators/base.py +7 -0
  44. dstack/_internal/server/services/locking.py +3 -1
  45. dstack/_internal/server/services/logging.py +4 -2
  46. dstack/_internal/server/services/logs/__init__.py +15 -2
  47. dstack/_internal/server/services/logs/aws.py +2 -4
  48. dstack/_internal/server/services/logs/filelog.py +33 -27
  49. dstack/_internal/server/services/logs/gcp.py +3 -5
  50. dstack/_internal/server/services/proxy/repo.py +4 -1
  51. dstack/_internal/server/services/runs.py +115 -32
  52. dstack/_internal/server/services/services/__init__.py +2 -1
  53. dstack/_internal/server/services/users.py +3 -1
  54. dstack/_internal/server/services/volumes.py +13 -0
  55. dstack/_internal/server/settings.py +7 -2
  56. dstack/_internal/server/statics/index.html +1 -1
  57. dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js → main-64f8273740c4b52c18f5.js} +6 -6
  58. dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js.map → main-64f8273740c4b52c18f5.js.map} +1 -1
  59. dstack/_internal/server/testing/common.py +41 -5
  60. dstack/_internal/server/utils/routers.py +31 -8
  61. dstack/_internal/utils/json_utils.py +54 -0
  62. dstack/api/_public/runs.py +13 -2
  63. dstack/api/server/_runs.py +12 -2
  64. dstack/version.py +1 -1
  65. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/METADATA +7 -5
  66. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/RECORD +69 -66
  67. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/WHEEL +0 -0
  68. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/entry_points.txt +0 -0
  69. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/services/jobs/__init__.py:

```diff
@@ -134,6 +134,8 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
     finished_at = None
     if job_model.status.is_finished():
         finished_at = last_processed_at
+    status_message = _get_job_status_message(job_model)
+    error = _get_job_error(job_model)
     return JobSubmission(
         id=job_model.id,
         submission_num=job_model.submission_num,
```
```diff
@@ -143,11 +145,13 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
         finished_at=finished_at,
         inactivity_secs=job_model.inactivity_secs,
         status=job_model.status,
+        status_message=status_message,
         termination_reason=job_model.termination_reason,
         termination_reason_message=job_model.termination_reason_message,
         exit_status=job_model.exit_status,
         job_provisioning_data=job_provisioning_data,
         job_runtime_data=get_job_runtime_data(job_model),
+        error=error,
     )
```
```diff
@@ -289,6 +293,19 @@ async def process_terminating_job(
         # so that stuck volumes don't prevent the instance from terminating.
         job_model.instance_id = None
         instance_model.last_job_processed_at = common.get_current_datetime()
+
+        volume_names = (
+            jrd.volume_names
+            if jrd and jrd.volume_names
+            else [va.volume.name for va in instance_model.volume_attachments]
+        )
+        if volume_names:
+            volumes = await list_project_volume_models(
+                session=session, project=instance_model.project, names=volume_names
+            )
+            for volume in volumes:
+                volume.last_job_processed_at = common.get_current_datetime()
+
         logger.info(
             "%s: instance '%s' has been released, new status is %s",
             fmt(job_model),
```
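Reviewer note: the `last_job_processed_at` stamp written here is presumably what the new `process_idle_volumes` background task (added in this release, with the `d5863798bf41` migration providing the column) keys off. A minimal sketch of such an idle check, assuming the cleanup duration is already resolved to seconds — illustrative only, not the actual task logic:

```python
from datetime import datetime, timedelta, timezone

# Hypothetical idle check: a volume becomes eligible for cleanup once it has
# had no job activity for at least auto_cleanup_seconds. The real logic lives
# in dstack/_internal/server/background/tasks/process_idle_volumes.py.
def is_idle_past_duration(
    last_job_processed_at: datetime, auto_cleanup_seconds: int, now: datetime
) -> bool:
    return now - last_job_processed_at >= timedelta(seconds=auto_cleanup_seconds)

now = datetime(2025, 1, 1, tzinfo=timezone.utc)
assert is_idle_past_duration(now - timedelta(hours=3), 2 * 3600, now)
assert not is_idle_past_duration(now - timedelta(minutes=30), 2 * 3600, now)
```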
```diff
@@ -693,3 +710,31 @@ def _get_job_mount_point_attached_volume(
             continue
         return volume
     raise ServerClientError("Failed to find an eligible volume for the mount point")
+
+
+def _get_job_status_message(job_model: JobModel) -> str:
+    if job_model.status == JobStatus.DONE:
+        return "exited (0)"
+    elif job_model.status == JobStatus.FAILED:
+        if job_model.termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
+            return f"exited ({job_model.exit_status})"
+        elif (
+            job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
+        ):
+            return "no offers"
+        elif job_model.termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
+            return "interrupted"
+        else:
+            return "error"
+    elif job_model.status == JobStatus.TERMINATED:
+        if job_model.termination_reason == JobTerminationReason.TERMINATED_BY_USER:
+            return "stopped"
+        elif job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER:
+            return "aborted"
+    return job_model.status.value
+
+
+def _get_job_error(job_model: JobModel) -> Optional[str]:
+    if job_model.termination_reason is None:
+        return None
+    return job_model.termination_reason.to_error()
```
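The branches above condense to a small mapping. A dependency-free restatement (enum members replaced by their string values; these are stubs, not dstack's actual models):

```python
from typing import Optional

# Mirrors _get_job_status_message above; statuses and reasons are passed as
# plain strings here for illustration.
def status_message(status: str, reason: Optional[str], exit_status: Optional[int]) -> str:
    if status == "done":
        return "exited (0)"
    if status == "failed":
        if reason == "container_exited_with_error":
            return f"exited ({exit_status})"
        if reason == "failed_to_start_due_to_no_capacity":
            return "no offers"
        if reason == "interrupted_by_no_capacity":
            return "interrupted"
        return "error"
    if status == "terminated":
        if reason == "terminated_by_user":
            return "stopped"
        if reason == "aborted_by_user":
            return "aborted"
    return status

assert status_message("failed", "container_exited_with_error", 137) == "exited (137)"
assert status_message("terminated", "aborted_by_user", None) == "aborted"
assert status_message("running", None, None) == "running"
```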
dstack/_internal/server/services/jobs/configurators/base.py:

```diff
@@ -15,6 +15,7 @@ from dstack._internal.core.models.configurations import (
     PortMapping,
     PythonVersion,
     RunConfigurationType,
+    ServiceConfiguration,
 )
 from dstack._internal.core.models.profiles import (
     DEFAULT_STOP_DURATION,
```
```diff
@@ -153,6 +154,7 @@ class JobConfigurator(ABC):
             repo_data=self.run_spec.repo_data,
             repo_code_hash=self.run_spec.repo_code_hash,
             file_archives=self.run_spec.file_archives,
+            service_port=self._service_port(),
         )
         return job_spec
```
```diff
@@ -306,6 +308,11 @@ class JobConfigurator(ABC):
             )
         return self._job_ssh_key

+    def _service_port(self) -> Optional[int]:
+        if isinstance(self.run_spec.configuration, ServiceConfiguration):
+            return self.run_spec.configuration.port.container_port
+        return None
+

 def interpolate_job_volumes(
     run_volumes: List[Union[MountPoint, str]],
```
dstack/_internal/server/services/locking.py:

```diff
@@ -172,7 +172,7 @@ async def _wait_to_lock_many(
     The keys must be sorted to prevent deadlock.
     """
     left_to_lock = keys.copy()
-    while len(left_to_lock) > 0:
+    while True:
         async with lock:
             locked_now_num = 0
             for key in left_to_lock:
```
```diff
@@ -182,4 +182,6 @@ async def _wait_to_lock_many(
                     locked.add(key)
                     locked_now_num += 1
             left_to_lock = left_to_lock[locked_now_num:]
+            if not left_to_lock:
+                return
         await asyncio.sleep(delay)
```
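A minimal sketch of the pattern these two hunks adjust (simplified names, not dstack's actual signature). The fix moves the emptiness check inside the critical section, so the coroutine returns right after locking the last key instead of sleeping one extra `delay` and re-checking:

```python
import asyncio
from typing import Hashable, List, Set

async def wait_to_lock_many(
    lock: asyncio.Lock, locked: Set[Hashable], keys: List[Hashable], delay: float = 0.05
) -> None:
    left_to_lock = sorted(keys)  # sorted acquisition order prevents deadlock
    while True:
        async with lock:
            locked_now_num = 0
            for key in left_to_lock:
                if key in locked:
                    break  # key held elsewhere; retry the remainder after a delay
                locked.add(key)
                locked_now_num += 1
            left_to_lock = left_to_lock[locked_now_num:]
            if not left_to_lock:
                return  # everything locked; no trailing sleep
        await asyncio.sleep(delay)

async def main() -> None:
    lock, locked = asyncio.Lock(), set()
    await wait_to_lock_many(lock, locked, ["b", "a"])
    print(locked)  # {'a', 'b'}

asyncio.run(main())
```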
dstack/_internal/server/services/logging.py:

```diff
@@ -1,12 +1,14 @@
 from typing import Union

-from dstack._internal.server.models import JobModel, RunModel
+from dstack._internal.server.models import GatewayModel, JobModel, RunModel


-def fmt(model: Union[RunModel, JobModel]) -> str:
+def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
     """Consistent string representation of a model for logging."""
     if isinstance(model, RunModel):
         return f"run({model.id.hex[:6]}){model.run_name}"
     if isinstance(model, JobModel):
         return f"job({model.id.hex[:6]}){model.job_name}"
+    if isinstance(model, GatewayModel):
+        return f"gateway({model.id.hex[:6]}){model.name}"
     return str(model)
```
dstack/_internal/server/services/logs/__init__.py:

```diff
@@ -8,7 +8,11 @@ from dstack._internal.server.models import ProjectModel
 from dstack._internal.server.schemas.logs import PollLogsRequest
 from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
-from dstack._internal.server.services.logs.base import LogStorage, LogStorageError
+from dstack._internal.server.services.logs.base import (
+    LogStorage,
+    LogStorageError,
+    b64encode_raw_message,
+)
 from dstack._internal.server.services.logs.filelog import FileLogStorage
 from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
 from dstack._internal.utils.common import run_async
```
```diff
@@ -75,4 +79,13 @@ def write_logs(


 async def poll_logs_async(project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
-    return await run_async(get_log_storage().poll_logs, project=project, request=request)
+    job_submission_logs = await run_async(
+        get_log_storage().poll_logs, project=project, request=request
+    )
+    # Logs are stored in plaintext but transmitted in base64 for API/CLI backward compatibility.
+    # Old logs stored in base64 are encoded twice for transmission and shown as base64 in CLI/UI.
+    # We live with that.
+    # TODO: Drop base64 encoding in 0.20.
+    for log_event in job_submission_logs.logs:
+        log_event.message = b64encode_raw_message(log_event.message.encode())
+    return job_submission_logs
```
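The encoding move this release makes is: store plaintext, base64-encode only at the API boundary. A quick round-trip showing the wire format and the double-encoding caveat the comment accepts for pre-0.19.19 records (the helper here is a stand-in for `b64encode_raw_message` from `dstack._internal.server.services.logs.base`):

```python
import base64

def b64encode_raw_message(message: bytes) -> str:
    # Stand-in for the real helper: bytes -> base64 text.
    return base64.b64encode(message).decode()

plaintext = "Hello from the runner"
wire = b64encode_raw_message(plaintext.encode())  # what the API transmits
assert base64.b64decode(wire).decode() == plaintext

# A record that was *stored* base64-encoded (old format) gets encoded a second
# time for transmission, so it renders as base64 in the CLI/UI.
legacy_stored = b64encode_raw_message(plaintext.encode())
wire_legacy = b64encode_raw_message(legacy_stored.encode())
assert base64.b64decode(base64.b64decode(wire_legacy)).decode() == plaintext
```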
dstack/_internal/server/services/logs/aws.py:

```diff
@@ -17,7 +17,6 @@ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
     LogStorageError,
-    b64encode_raw_message,
     datetime_to_unix_time_ms,
     unix_time_ms_to_datetime,
 )
```
```diff
@@ -238,8 +237,7 @@ class CloudWatchLogStorage(LogStorage):
                 skipped_future_events += 1
                 continue
             cw_event = self._runner_log_event_to_cloudwatch_event(event)
-            # as message is base64-encoded, length in bytes = length in code points.
-            message_size = len(cw_event["message"]) + self.MESSAGE_OVERHEAD_SIZE
+            message_size = len(event.message) + self.MESSAGE_OVERHEAD_SIZE
             if message_size > self.MESSAGE_MAX_SIZE:
                 # we should never hit this limit, as we use `io.Copy` to copy from pty to logs,
                 # which under the hood uses 32KiB buffer, see runner/internal/executor/executor.go,
```
```diff
@@ -271,7 +269,7 @@ class CloudWatchLogStorage(LogStorage):
     ) -> _CloudWatchLogEvent:
         return {
             "timestamp": runner_log_event.timestamp,
-            "message": b64encode_raw_message(runner_log_event.message),
+            "message": runner_log_event.message.decode(errors="replace"),
         }

     @contextmanager
```
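The deleted comment explained why the old size check was safe: base64 output is pure ASCII, so string length in code points equaled length in bytes. With raw messages that equality breaks, which is why the checks above now measure `event.message` (the raw bytes) instead of the decoded string. A quick illustration:

```python
import base64

raw = "héllo".encode()                  # 6 bytes: "é" takes 2 bytes in UTF-8
decoded = raw.decode(errors="replace")  # 5 code points
assert (len(raw), len(decoded)) == (6, 5)

b64 = base64.b64encode(raw).decode()    # base64 is ASCII: bytes == code points
assert len(b64) == len(b64.encode())
```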
dstack/_internal/server/services/logs/filelog.py:

```diff
@@ -2,6 +2,7 @@ from pathlib import Path
 from typing import List, Union
 from uuid import UUID

+from dstack._internal.core.errors import ServerClientError
 from dstack._internal.core.models.logs import (
     JobSubmissionLogs,
     LogEvent,
```
```diff
@@ -14,8 +15,6 @@ from dstack._internal.server.schemas.logs import PollLogsRequest
 from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
-    LogStorageError,
-    b64encode_raw_message,
     unix_time_ms_to_datetime,
 )
```
```diff
@@ -30,9 +29,6 @@ class FileLogStorage(LogStorage):
         self.root = Path(root)

     def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
-        if request.descending:
-            raise LogStorageError("descending: true is not supported")
-
         log_producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
         log_file_path = self._get_log_file_path(
             project_name=project.name,
```
```diff
@@ -46,11 +42,11 @@ class FileLogStorage(LogStorage):
         try:
             start_line = int(request.next_token)
             if start_line < 0:
-                raise LogStorageError(
+                raise ServerClientError(
                     f"Invalid next_token: {request.next_token}. Must be a non-negative integer."
                 )
         except ValueError:
-            raise LogStorageError(
+            raise ServerClientError(
                 f"Invalid next_token: {request.next_token}. Must be a valid integer."
             )
```
```diff
@@ -60,31 +56,41 @@

         try:
             with open(log_file_path) as f:
-                lines = f.readlines()
-
-                for i, line in enumerate(lines):
-                    if current_line < start_line:
+                # Skip to start_line if needed
+                for _ in range(start_line):
+                    if f.readline() == "":
+                        # File is shorter than start_line
+                        return JobSubmissionLogs(logs=logs, next_token=next_token)
                     current_line += 1
-                        continue

-                    log_event = LogEvent.__response__.parse_raw(line)
-                    current_line += 1
+                # Read lines one by one
+                while True:
+                    line = f.readline()
+                    if line == "":  # EOF
+                        break
+
+                    current_line += 1

-                    if request.start_time and log_event.timestamp <= request.start_time:
-                        continue
-                    if request.end_time is not None and log_event.timestamp >= request.end_time:
-                        break
+                    try:
+                        log_event = LogEvent.__response__.parse_raw(line)
+                    except Exception:
+                        # Skip malformed lines
+                        continue

-                    logs.append(log_event)
+                    if request.start_time and log_event.timestamp <= request.start_time:
+                        continue
+                    if request.end_time is not None and log_event.timestamp >= request.end_time:
+                        break

-                    if len(logs) >= request.limit:
-                        # Only set next_token if there are more lines to read
-                        if current_line < len(lines):
-                            next_token = str(current_line)
-                        break
+                    logs.append(log_event)

-        except IOError as e:
-            raise LogStorageError(f"Failed to read log file {log_file_path}: {e}")
+                    if len(logs) >= request.limit:
+                        # Check if there are more lines to read
+                        if f.readline() != "":
+                            next_token = str(current_line)
+                        break
+        except FileNotFoundError:
+            pass

         return JobSubmissionLogs(logs=logs, next_token=next_token)
```
```diff
@@ -140,5 +146,5 @@ class FileLogStorage(LogStorage):
         return LogEvent(
             timestamp=unix_time_ms_to_datetime(runner_log_event.timestamp),
             log_source=LogEventSource.STDOUT,
-            message=b64encode_raw_message(runner_log_event.message),
+            message=runner_log_event.message.decode(errors="replace"),
         )
```
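The rewrite above replaces `readlines()` (which loads the whole file) with lazy, line-at-a-time reads plus a line-offset `next_token` for pagination. A self-contained sketch of the same scheme, stripped of dstack's types:

```python
from typing import List, Optional, Tuple

# Read a line-oriented log file lazily: skip `start_line` lines, return up to
# `limit` records, and hand back a next_token only when more lines remain.
def poll_lines(path: str, next_token: Optional[str], limit: int) -> Tuple[List[str], Optional[str]]:
    start_line = int(next_token) if next_token else 0
    lines: List[str] = []
    token: Optional[str] = None
    current_line = 0
    try:
        with open(path) as f:
            for _ in range(start_line):
                if f.readline() == "":
                    return lines, token  # file shorter than start_line
                current_line += 1
            while True:
                line = f.readline()
                if line == "":  # EOF
                    break
                current_line += 1
                lines.append(line.rstrip("\n"))
                if len(lines) >= limit:
                    if f.readline() != "":  # more data follows
                        token = str(current_line)
                    break
    except FileNotFoundError:
        pass  # missing file now means "no logs", mirroring the hunk above
    return lines, token
```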
dstack/_internal/server/services/logs/gcp.py:

```diff
@@ -14,7 +14,6 @@ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
     LogStorageError,
-    b64encode_raw_message,
     unix_time_ms_to_datetime,
 )
 from dstack._internal.utils.common import batched
```
```diff
@@ -137,15 +136,14 @@ class GCPLogStorage(LogStorage):
         with self.logger.batch() as batcher:
             for batch in batched(logs, self.MAX_BATCH_SIZE):
                 for log in batch:
-                    message = b64encode_raw_message(log.message)
+                    message = log.message.decode(errors="replace")
                     timestamp = unix_time_ms_to_datetime(log.timestamp)
-                    # as message is base64-encoded, length in bytes = length in code points
-                    if len(message) > self.MAX_RUNNER_MESSAGE_SIZE:
+                    if len(log.message) > self.MAX_RUNNER_MESSAGE_SIZE:
                         logger.error(
                             "Stream %s: skipping event at %s, message exceeds max size: %d > %d",
                             stream_name,
                             timestamp.isoformat(),
-                            len(message),
+                            len(log.message),
                             self.MAX_RUNNER_MESSAGE_SIZE,
                         )
                         continue
```
dstack/_internal/server/services/proxy/repo.py:

```diff
@@ -12,10 +12,12 @@ from dstack._internal.core.models.configurations import ServiceConfiguration
 from dstack._internal.core.models.instances import RemoteConnectionInfo, SSHConnectionParams
 from dstack._internal.core.models.runs import (
     JobProvisioningData,
+    JobSpec,
     JobStatus,
     RunSpec,
     RunStatus,
     ServiceSpec,
+    get_service_port,
 )
 from dstack._internal.core.models.services import AnyModel
 from dstack._internal.proxy.lib.models import (
```
```diff
@@ -97,9 +99,10 @@ class ServerProxyRepo(BaseProxyRepo):
             if rci.ssh_proxy is not None:
                 ssh_head_proxy = rci.ssh_proxy
                 ssh_head_proxy_private_key = get_or_error(rci.ssh_proxy_keys)[0].private
+            job_spec: JobSpec = JobSpec.__response__.parse_raw(job.job_spec_data)
             replica = Replica(
                 id=job.id.hex,
-                app_port=run_spec.configuration.port.container_port,
+                app_port=get_service_port(job_spec, run_spec.configuration),
                 ssh_destination=ssh_destination,
                 ssh_port=ssh_port,
                 ssh_proxy=ssh_proxy,
```
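The exact behavior of `get_service_port` lives in `dstack/_internal/core/models/runs.py` (also changed in this release). A plausible reading, given that `service_port` was just added to `JobSpec` and `port` became updatable in rolling deployments, is a submission-time freeze with a legacy fallback — a hedged sketch, not a copy of the real helper:

```python
from typing import Optional

# Assumption: prefer the port recorded in the job spec at submission time, so
# a rolling deployment that changes `port` doesn't break replicas provisioned
# under the old value; specs from older servers lack the field and fall back.
def get_service_port_sketch(service_port: Optional[int], configuration_container_port: int) -> int:
    if service_port is not None:
        return service_port
    return configuration_container_port

assert get_service_port_sketch(8080, 80) == 8080  # new spec: frozen at submission
assert get_service_port_sketch(None, 80) == 80    # legacy spec: use configuration
```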
dstack/_internal/server/services/runs.py:

```diff
@@ -24,6 +24,7 @@ from dstack._internal.core.models.instances import (
 )
 from dstack._internal.core.models.profiles import (
     CreationPolicy,
+    RetryEvent,
 )
 from dstack._internal.core.models.repos.virtual import DEFAULT_VIRTUAL_REPO_ID, VirtualRunRepoData
 from dstack._internal.core.models.runs import (
```
```diff
@@ -105,6 +106,8 @@ async def list_user_runs(
     repo_id: Optional[str],
     username: Optional[str],
     only_active: bool,
+    include_jobs: bool,
+    job_submissions_limit: Optional[int],
     prev_submitted_at: Optional[datetime],
     prev_run_id: Optional[uuid.UUID],
     limit: int,
```
```diff
@@ -148,7 +151,14 @@
     runs = []
     for r in run_models:
         try:
-            runs.append(run_model_to_run(r, return_in_api=True))
+            runs.append(
+                run_model_to_run(
+                    r,
+                    return_in_api=True,
+                    include_jobs=include_jobs,
+                    job_submissions_limit=job_submissions_limit,
+                )
+            )
         except pydantic.ValidationError:
             pass
     if len(run_models) > len(runs):
```
```diff
@@ -652,51 +662,33 @@

 def run_model_to_run(
     run_model: RunModel,
-    include_job_submissions: bool = True,
+    include_jobs: bool = True,
+    job_submissions_limit: Optional[int] = None,
     return_in_api: bool = False,
     include_sensitive: bool = False,
 ) -> Run:
     jobs: List[Job] = []
-    run_jobs = sorted(run_model.jobs, key=lambda j: (j.replica_num, j.job_num, j.submission_num))
-    for replica_num, replica_submissions in itertools.groupby(
-        run_jobs, key=lambda j: j.replica_num
-    ):
-        for job_num, job_submissions in itertools.groupby(
-            replica_submissions, key=lambda j: j.job_num
-        ):
-            submissions = []
-            job_model = None
-            for job_model in job_submissions:
-                if include_job_submissions:
-                    job_submission = job_model_to_job_submission(job_model)
-                    if return_in_api:
-                        # Set default non-None values for 0.18 backward-compatibility
-                        # Remove in 0.19
-                        if job_submission.job_provisioning_data is not None:
-                            if job_submission.job_provisioning_data.hostname is None:
-                                job_submission.job_provisioning_data.hostname = ""
-                            if job_submission.job_provisioning_data.ssh_port is None:
-                                job_submission.job_provisioning_data.ssh_port = 22
-                    submissions.append(job_submission)
-            if job_model is not None:
-                # Use the spec from the latest submission. Submissions can have different specs
-                job_spec = JobSpec.__response__.parse_raw(job_model.job_spec_data)
-                if not include_sensitive:
-                    _remove_job_spec_sensitive_info(job_spec)
-                jobs.append(Job(job_spec=job_spec, job_submissions=submissions))
+    if include_jobs:
+        jobs = _get_run_jobs_with_submissions(
+            run_model=run_model,
+            job_submissions_limit=job_submissions_limit,
+            return_in_api=return_in_api,
+            include_sensitive=include_sensitive,
+        )

     run_spec = RunSpec.__response__.parse_raw(run_model.run_spec)

     latest_job_submission = None
-    if include_job_submissions:
+    if len(jobs) > 0 and len(jobs[0].job_submissions) > 0:
         # TODO(egor-s): does it make sense with replicas and multi-node?
-        if jobs:
-            latest_job_submission = jobs[0].job_submissions[-1]
+        latest_job_submission = jobs[0].job_submissions[-1]

     service_spec = None
     if run_model.service_spec is not None:
         service_spec = ServiceSpec.__response__.parse_raw(run_model.service_spec)

+    status_message = _get_run_status_message(run_model)
+    error = _get_run_error(run_model)
     run = Run(
         id=run_model.id,
         project_name=run_model.project.name,
```
```diff
@@ -704,18 +696,107 @@ def run_model_to_run(
         submitted_at=run_model.submitted_at.replace(tzinfo=timezone.utc),
         last_processed_at=run_model.last_processed_at.replace(tzinfo=timezone.utc),
         status=run_model.status,
+        status_message=status_message,
         termination_reason=run_model.termination_reason,
         run_spec=run_spec,
         jobs=jobs,
         latest_job_submission=latest_job_submission,
         service=service_spec,
         deployment_num=run_model.deployment_num,
+        error=error,
         deleted=run_model.deleted,
     )
     run.cost = _get_run_cost(run)
     return run


+def _get_run_jobs_with_submissions(
+    run_model: RunModel,
+    job_submissions_limit: Optional[int],
+    return_in_api: bool = False,
+    include_sensitive: bool = False,
+) -> List[Job]:
+    jobs: List[Job] = []
+    run_jobs = sorted(run_model.jobs, key=lambda j: (j.replica_num, j.job_num, j.submission_num))
+    for replica_num, replica_submissions in itertools.groupby(
+        run_jobs, key=lambda j: j.replica_num
+    ):
+        for job_num, job_models in itertools.groupby(replica_submissions, key=lambda j: j.job_num):
+            submissions = []
+            job_model = None
+            if job_submissions_limit is not None:
+                if job_submissions_limit == 0:
+                    # Take latest job submission to return its job_spec
+                    job_models = list(job_models)[-1:]
+                else:
+                    job_models = list(job_models)[-job_submissions_limit:]
+            for job_model in job_models:
+                if job_submissions_limit != 0:
+                    job_submission = job_model_to_job_submission(job_model)
+                    if return_in_api:
+                        # Set default non-None values for 0.18 backward-compatibility
+                        # Remove in 0.19
+                        if job_submission.job_provisioning_data is not None:
+                            if job_submission.job_provisioning_data.hostname is None:
+                                job_submission.job_provisioning_data.hostname = ""
+                            if job_submission.job_provisioning_data.ssh_port is None:
+                                job_submission.job_provisioning_data.ssh_port = 22
+                    submissions.append(job_submission)
+            if job_model is not None:
+                # Use the spec from the latest submission. Submissions can have different specs
+                job_spec = JobSpec.__response__.parse_raw(job_model.job_spec_data)
+                if not include_sensitive:
+                    _remove_job_spec_sensitive_info(job_spec)
+                jobs.append(Job(job_spec=job_spec, job_submissions=submissions))
+    return jobs
+
+
+def _get_run_status_message(run_model: RunModel) -> str:
+    if len(run_model.jobs) == 0:
+        return run_model.status.value
+
+    sorted_job_models = sorted(
+        run_model.jobs, key=lambda j: (j.replica_num, j.job_num, j.submission_num)
+    )
+    job_models_grouped_by_job = list(
+        list(jm)
+        for _, jm in itertools.groupby(sorted_job_models, key=lambda j: (j.replica_num, j.job_num))
+    )
+
+    if all(job_models[-1].status == JobStatus.PULLING for job_models in job_models_grouped_by_job):
+        # Show `pulling` if last job submission of all jobs is pulling
+        return "pulling"
+
+    if run_model.status in [RunStatus.SUBMITTED, RunStatus.PENDING]:
+        # Show `retrying` if any job caused the run to retry
+        for job_models in job_models_grouped_by_job:
+            last_job_spec = JobSpec.__response__.parse_raw(job_models[-1].job_spec_data)
+            retry_on_events = last_job_spec.retry.on_events if last_job_spec.retry else []
+            last_job_termination_reason = _get_last_job_termination_reason(job_models)
+            if (
+                last_job_termination_reason
+                == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
+                and RetryEvent.NO_CAPACITY in retry_on_events
+            ):
+                # TODO: Show `retrying` for other retry events
+                return "retrying"
+
+    return run_model.status.value
+
+
+def _get_last_job_termination_reason(job_models: List[JobModel]) -> Optional[JobTerminationReason]:
+    for job_model in reversed(job_models):
+        if job_model.termination_reason is not None:
+            return job_model.termination_reason
+    return None
+
+
+def _get_run_error(run_model: RunModel) -> Optional[str]:
+    if run_model.termination_reason is None:
+        return None
+    return run_model.termination_reason.to_error()
+
+
 async def _get_pool_offers(
     session: AsyncSession,
     project: ProjectModel,
```
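The `job_submissions_limit` truncation in `_get_run_jobs_with_submissions` has three regimes: `None` keeps every submission, `0` keeps the latest submission only internally (to extract its `job_spec`; no submissions are returned), and `N > 0` keeps the `N` most recent. A condensed illustration of that slicing:

```python
from typing import List, Optional

def tail_submissions(submissions: List[str], limit: Optional[int]) -> List[str]:
    # Mirrors the slicing above; with limit == 0 the latest submission is kept
    # internally for its job_spec but excluded from the returned list.
    if limit is None:
        return submissions
    return [] if limit == 0 else submissions[-limit:]

assert tail_submissions(["s0", "s1", "s2", "s3"], None) == ["s0", "s1", "s2", "s3"]
assert tail_submissions(["s0", "s1", "s2", "s3"], 2) == ["s2", "s3"]
assert tail_submissions(["s0", "s1", "s2", "s3"], 0) == []
```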
```diff
@@ -914,6 +995,8 @@ _TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS = {
     "replicas",
     "scaling",
     # rolling deployment
+    # NOTE: keep this list in sync with the "Rolling deployment" section in services.md
+    "port",
     "resources",
     "volumes",
     "docker",
```
dstack/_internal/server/services/services/__init__.py:

```diff
@@ -22,7 +22,7 @@ from dstack._internal.core.errors import (
 from dstack._internal.core.models.configurations import SERVICE_HTTPS_DEFAULT, ServiceConfiguration
 from dstack._internal.core.models.gateways import GatewayConfiguration, GatewayStatus
 from dstack._internal.core.models.instances import SSHConnectionParams
-from dstack._internal.core.models.runs import Run, RunSpec, ServiceModelSpec, ServiceSpec
+from dstack._internal.core.models.runs import JobSpec, Run, RunSpec, ServiceModelSpec, ServiceSpec
 from dstack._internal.server import settings
 from dstack._internal.server.models import GatewayModel, JobModel, ProjectModel, RunModel
 from dstack._internal.server.services.gateways import (
```
```diff
@@ -179,6 +179,7 @@ async def register_replica(
     async with conn.client() as client:
         await client.register_replica(
             run=run,
+            job_spec=JobSpec.__response__.parse_raw(job_model.job_spec_data),
             job_submission=job_submission,
             ssh_head_proxy=ssh_head_proxy,
             ssh_head_proxy_private_key=ssh_head_proxy_private_key,
```
dstack/_internal/server/services/users.py:

```diff
@@ -44,7 +44,9 @@ async def list_users_for_user(
     session: AsyncSession,
     user: UserModel,
 ) -> List[User]:
-    return await list_all_users(session=session)
+    if user.global_role == GlobalRole.ADMIN:
+        return await list_all_users(session=session)
+    return [user_model_to_user(user)]


 async def list_all_users(
```
dstack/_internal/server/services/volumes.py:

```diff
@@ -401,6 +401,19 @@ def _validate_volume_configuration(configuration: VolumeConfiguration):
     if configuration.name is not None:
         validate_dstack_resource_name(configuration.name)

+    if configuration.volume_id is not None and configuration.auto_cleanup_duration is not None:
+        if (
+            isinstance(configuration.auto_cleanup_duration, int)
+            and configuration.auto_cleanup_duration > 0
+        ) or (
+            isinstance(configuration.auto_cleanup_duration, str)
+            and configuration.auto_cleanup_duration not in ("off", "-1")
+        ):
+            raise ServerClientError(
+                "External volumes (with volume_id) do not support auto_cleanup_duration. "
+                "Auto-cleanup only works for volumes created and managed by dstack."
+            )
+

 async def _delete_volume(session: AsyncSession, project: ProjectModel, volume_model: VolumeModel):
     volume = volume_model_to_volume(volume_model)
```
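The new check rejects external volumes only when auto-cleanup is actually enabled; the disabled spellings still pass. Restated as a standalone predicate (volume IDs here are made up):

```python
from typing import Optional, Union

# Mirrors the validation above: an external volume (volume_id set) is rejected
# only for an *enabled* auto_cleanup_duration; "off", "-1", and non-positive
# ints mean disabled and are allowed through.
def rejects(volume_id: Optional[str], duration: Optional[Union[str, int]]) -> bool:
    if volume_id is None or duration is None:
        return False
    if isinstance(duration, int):
        return duration > 0
    return duration not in ("off", "-1")

assert rejects("vol-0abc123", "2h")       # enabled duration on external volume
assert rejects("vol-0abc123", 3600)
assert not rejects("vol-0abc123", "off")  # explicitly disabled: allowed
assert not rejects(None, "2h")            # dstack-managed volume: allowed
```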
dstack/_internal/server/settings.py:

```diff
@@ -42,6 +42,11 @@ SERVER_BACKGROUND_PROCESSING_FACTOR = int(
     os.getenv("DSTACK_SERVER_BACKGROUND_PROCESSING_FACTOR", 1)
 )

+SERVER_BACKGROUND_PROCESSING_DISABLED = (
+    os.getenv("DSTACK_SERVER_BACKGROUND_PROCESSING_DISABLED") is not None
+)
+SERVER_BACKGROUND_PROCESSING_ENABLED = not SERVER_BACKGROUND_PROCESSING_DISABLED
+
 SERVER_EXECUTOR_MAX_WORKERS = int(os.getenv("DSTACK_SERVER_EXECUTOR_MAX_WORKERS", 128))

 MAX_OFFERS_TRIED = int(os.getenv("DSTACK_SERVER_MAX_OFFERS_TRIED", 25))
```
```diff
@@ -113,5 +118,5 @@ SERVER_PROFILING_ENABLED = os.getenv("DSTACK_SERVER_PROFILING_ENABLED") is not None

 UPDATE_DEFAULT_PROJECT = os.getenv("DSTACK_UPDATE_DEFAULT_PROJECT") is not None
 DO_NOT_UPDATE_DEFAULT_PROJECT = os.getenv("DSTACK_DO_NOT_UPDATE_DEFAULT_PROJECT") is not None
-SKIP_GATEWAY_UPDATE = os.getenv("DSTACK_SKIP_GATEWAY_UPDATE", None) is not None
-ENABLE_PROMETHEUS_METRICS = os.getenv("DSTACK_ENABLE_PROMETHEUS_METRICS", None) is not None
+SKIP_GATEWAY_UPDATE = os.getenv("DSTACK_SKIP_GATEWAY_UPDATE") is not None
+ENABLE_PROMETHEUS_METRICS = os.getenv("DSTACK_ENABLE_PROMETHEUS_METRICS") is not None
```
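Note that these flags (like the new `DSTACK_SERVER_BACKGROUND_PROCESSING_DISABLED`) test only for presence, not truthiness — the dropped `, None` argument was redundant, since that is already `os.getenv`'s default. Any value, including "0" or "false", turns a flag on:

```python
import os

# Presence-only semantics: exporting the variable with *any* value sets the
# flag; only unsetting it clears the flag.
os.environ["DSTACK_ENABLE_PROMETHEUS_METRICS"] = "false"
assert (os.getenv("DSTACK_ENABLE_PROMETHEUS_METRICS") is not None) is True

del os.environ["DSTACK_ENABLE_PROMETHEUS_METRICS"]
assert (os.getenv("DSTACK_ENABLE_PROMETHEUS_METRICS") is not None) is False
```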