dstack 0.19.18__py3-none-any.whl → 0.19.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (74)
  1. dstack/_internal/cli/services/configurators/fleet.py +99 -1
  2. dstack/_internal/cli/services/profile.py +1 -1
  3. dstack/_internal/core/backends/cloudrift/api_client.py +13 -1
  4. dstack/_internal/core/backends/oci/resources.py +5 -5
  5. dstack/_internal/core/compatibility/runs.py +12 -1
  6. dstack/_internal/core/compatibility/volumes.py +2 -0
  7. dstack/_internal/core/models/common.py +38 -2
  8. dstack/_internal/core/models/configurations.py +9 -1
  9. dstack/_internal/core/models/fleets.py +2 -1
  10. dstack/_internal/core/models/profiles.py +8 -5
  11. dstack/_internal/core/models/resources.py +15 -8
  12. dstack/_internal/core/models/runs.py +41 -138
  13. dstack/_internal/core/models/volumes.py +14 -0
  14. dstack/_internal/core/services/diff.py +30 -10
  15. dstack/_internal/core/services/ssh/attach.py +2 -0
  16. dstack/_internal/server/app.py +17 -9
  17. dstack/_internal/server/background/__init__.py +5 -3
  18. dstack/_internal/server/background/tasks/process_gateways.py +46 -28
  19. dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
  20. dstack/_internal/server/background/tasks/process_submitted_jobs.py +2 -0
  21. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
  22. dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
  23. dstack/_internal/server/models.py +1 -0
  24. dstack/_internal/server/routers/backends.py +23 -16
  25. dstack/_internal/server/routers/files.py +7 -6
  26. dstack/_internal/server/routers/fleets.py +47 -36
  27. dstack/_internal/server/routers/gateways.py +27 -18
  28. dstack/_internal/server/routers/instances.py +18 -13
  29. dstack/_internal/server/routers/logs.py +7 -3
  30. dstack/_internal/server/routers/metrics.py +14 -8
  31. dstack/_internal/server/routers/projects.py +33 -22
  32. dstack/_internal/server/routers/repos.py +7 -6
  33. dstack/_internal/server/routers/runs.py +49 -28
  34. dstack/_internal/server/routers/secrets.py +20 -15
  35. dstack/_internal/server/routers/server.py +7 -4
  36. dstack/_internal/server/routers/users.py +22 -19
  37. dstack/_internal/server/routers/volumes.py +34 -25
  38. dstack/_internal/server/schemas/logs.py +3 -11
  39. dstack/_internal/server/schemas/runs.py +17 -5
  40. dstack/_internal/server/services/fleets.py +354 -72
  41. dstack/_internal/server/services/gateways/__init__.py +13 -4
  42. dstack/_internal/server/services/gateways/client.py +5 -3
  43. dstack/_internal/server/services/instances.py +8 -0
  44. dstack/_internal/server/services/jobs/__init__.py +45 -0
  45. dstack/_internal/server/services/jobs/configurators/base.py +7 -0
  46. dstack/_internal/server/services/locking.py +3 -1
  47. dstack/_internal/server/services/logging.py +4 -2
  48. dstack/_internal/server/services/logs/__init__.py +15 -2
  49. dstack/_internal/server/services/logs/aws.py +47 -7
  50. dstack/_internal/server/services/logs/filelog.py +148 -32
  51. dstack/_internal/server/services/logs/gcp.py +3 -5
  52. dstack/_internal/server/services/prometheus/custom_metrics.py +20 -0
  53. dstack/_internal/server/services/proxy/repo.py +4 -1
  54. dstack/_internal/server/services/runs.py +115 -32
  55. dstack/_internal/server/services/services/__init__.py +2 -1
  56. dstack/_internal/server/services/users.py +3 -1
  57. dstack/_internal/server/services/volumes.py +13 -0
  58. dstack/_internal/server/settings.py +7 -2
  59. dstack/_internal/server/statics/index.html +1 -1
  60. dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js → main-39a767528976f8078166.js} +11 -30
  61. dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js.map → main-39a767528976f8078166.js.map} +1 -1
  62. dstack/_internal/server/statics/{main-d58fc0460cb0eae7cb5c.css → main-8f9ee218d3eb45989682.css} +2 -2
  63. dstack/_internal/server/testing/common.py +41 -5
  64. dstack/_internal/server/utils/routers.py +31 -8
  65. dstack/_internal/utils/common.py +10 -21
  66. dstack/_internal/utils/json_utils.py +54 -0
  67. dstack/api/_public/runs.py +13 -2
  68. dstack/api/server/_runs.py +12 -2
  69. dstack/version.py +1 -1
  70. {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/METADATA +7 -5
  71. {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/RECORD +74 -71
  72. {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/WHEEL +0 -0
  73. {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/entry_points.txt +0 -0
  74. {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/server/services/gateways/client.py
@@ -7,9 +7,9 @@ from pydantic import parse_obj_as
 
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
 from dstack._internal.core.errors import GatewayError
-from dstack._internal.core.models.configurations import RateLimit
+from dstack._internal.core.models.configurations import RateLimit, ServiceConfiguration
 from dstack._internal.core.models.instances import SSHConnectionParams
-from dstack._internal.core.models.runs import JobSubmission, Run
+from dstack._internal.core.models.runs import JobSpec, JobSubmission, Run, get_service_port
 from dstack._internal.proxy.gateway.schemas.stats import ServiceStats
 from dstack._internal.server import settings
 
@@ -80,13 +80,15 @@ class GatewayClient:
     async def register_replica(
         self,
         run: Run,
+        job_spec: JobSpec,
         job_submission: JobSubmission,
         ssh_head_proxy: Optional[SSHConnectionParams],
         ssh_head_proxy_private_key: Optional[str],
     ):
+        assert isinstance(run.run_spec.configuration, ServiceConfiguration)
         payload = {
             "job_id": job_submission.id.hex,
-            "app_port": run.run_spec.configuration.port.container_port,
+            "app_port": get_service_port(job_spec, run.run_spec.configuration),
             "ssh_head_proxy": ssh_head_proxy.dict() if ssh_head_proxy is not None else None,
             "ssh_head_proxy_private_key": ssh_head_proxy_private_key,
         }

dstack/_internal/server/services/instances.py
@@ -106,6 +106,14 @@ def get_instance_requirements(instance_model: InstanceModel) -> Requirements:
     return Requirements.__response__.parse_raw(instance_model.requirements)
 
 
+def get_instance_remote_connection_info(
+    instance_model: InstanceModel,
+) -> Optional[RemoteConnectionInfo]:
+    if instance_model.remote_connection_info is None:
+        return None
+    return RemoteConnectionInfo.__response__.parse_raw(instance_model.remote_connection_info)
+
+
 def get_instance_ssh_private_keys(instance_model: InstanceModel) -> tuple[str, Optional[str]]:
     """
     Returns a pair of SSH private keys: host key and optional proxy jump key.

dstack/_internal/server/services/jobs/__init__.py
@@ -134,6 +134,8 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
     finished_at = None
     if job_model.status.is_finished():
         finished_at = last_processed_at
+    status_message = _get_job_status_message(job_model)
+    error = _get_job_error(job_model)
     return JobSubmission(
         id=job_model.id,
         submission_num=job_model.submission_num,
@@ -143,11 +145,13 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
         finished_at=finished_at,
         inactivity_secs=job_model.inactivity_secs,
         status=job_model.status,
+        status_message=status_message,
         termination_reason=job_model.termination_reason,
         termination_reason_message=job_model.termination_reason_message,
         exit_status=job_model.exit_status,
         job_provisioning_data=job_provisioning_data,
         job_runtime_data=get_job_runtime_data(job_model),
+        error=error,
     )
 
 
@@ -289,6 +293,19 @@ async def process_terminating_job(
         # so that stuck volumes don't prevent the instance from terminating.
         job_model.instance_id = None
         instance_model.last_job_processed_at = common.get_current_datetime()
+
+        volume_names = (
+            jrd.volume_names
+            if jrd and jrd.volume_names
+            else [va.volume.name for va in instance_model.volume_attachments]
+        )
+        if volume_names:
+            volumes = await list_project_volume_models(
+                session=session, project=instance_model.project, names=volume_names
+            )
+            for volume in volumes:
+                volume.last_job_processed_at = common.get_current_datetime()
+
         logger.info(
             "%s: instance '%s' has been released, new status is %s",
             fmt(job_model),
@@ -693,3 +710,31 @@ def _get_job_mount_point_attached_volume(
             continue
         return volume
     raise ServerClientError("Failed to find an eligible volume for the mount point")
+
+
+def _get_job_status_message(job_model: JobModel) -> str:
+    if job_model.status == JobStatus.DONE:
+        return "exited (0)"
+    elif job_model.status == JobStatus.FAILED:
+        if job_model.termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
+            return f"exited ({job_model.exit_status})"
+        elif (
+            job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
+        ):
+            return "no offers"
+        elif job_model.termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
+            return "interrupted"
+        else:
+            return "error"
+    elif job_model.status == JobStatus.TERMINATED:
+        if job_model.termination_reason == JobTerminationReason.TERMINATED_BY_USER:
+            return "stopped"
+        elif job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER:
+            return "aborted"
+    return job_model.status.value
+
+
+def _get_job_error(job_model: JobModel) -> Optional[str]:
+    if job_model.termination_reason is None:
+        return None
+    return job_model.termination_reason.to_error()

dstack/_internal/server/services/jobs/configurators/base.py
@@ -15,6 +15,7 @@ from dstack._internal.core.models.configurations import (
     PortMapping,
     PythonVersion,
     RunConfigurationType,
+    ServiceConfiguration,
 )
 from dstack._internal.core.models.profiles import (
     DEFAULT_STOP_DURATION,
@@ -153,6 +154,7 @@ class JobConfigurator(ABC):
             repo_data=self.run_spec.repo_data,
             repo_code_hash=self.run_spec.repo_code_hash,
             file_archives=self.run_spec.file_archives,
+            service_port=self._service_port(),
         )
         return job_spec
 
@@ -306,6 +308,11 @@ class JobConfigurator(ABC):
         )
         return self._job_ssh_key
 
+    def _service_port(self) -> Optional[int]:
+        if isinstance(self.run_spec.configuration, ServiceConfiguration):
+            return self.run_spec.configuration.port.container_port
+        return None
+
 
 def interpolate_job_volumes(
     run_volumes: List[Union[MountPoint, str]],

dstack/_internal/server/services/locking.py
@@ -172,7 +172,7 @@ async def _wait_to_lock_many(
     The keys must be sorted to prevent deadlock.
     """
    left_to_lock = keys.copy()
-    while len(left_to_lock) > 0:
+    while True:
        async with lock:
            locked_now_num = 0
            for key in left_to_lock:
@@ -182,4 +182,6 @@
                locked.add(key)
                locked_now_num += 1
            left_to_lock = left_to_lock[locked_now_num:]
+            if not left_to_lock:
+                return
        await asyncio.sleep(delay)
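
The `_wait_to_lock_many` change above is a small latency fix: the old loop could only re-check `left_to_lock` after `await asyncio.sleep(delay)`, so it slept once more even when the final keys had just been acquired. A minimal standalone sketch of the new shape (simplified names, not dstack's actual locking API):

```python
import asyncio

async def wait_to_lock_many(lock: asyncio.Lock, locked: set, keys: list, delay: float = 0.1) -> None:
    # Keys must be sorted to prevent deadlock (per the docstring in the diff).
    left_to_lock = sorted(keys)
    while True:
        async with lock:
            locked_now_num = 0
            for key in left_to_lock:
                if key in locked:
                    break  # held elsewhere; retry after a delay
                locked.add(key)
                locked_now_num += 1
            left_to_lock = left_to_lock[locked_now_num:]
            if not left_to_lock:
                return  # all keys acquired: exit without the extra sleep
        await asyncio.sleep(delay)

async def main() -> None:
    lock, locked = asyncio.Lock(), set()
    await wait_to_lock_many(lock, locked, ["b", "a"])
    print(locked)  # {'a', 'b'}

asyncio.run(main())
```
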

dstack/_internal/server/services/logging.py
@@ -1,12 +1,14 @@
 from typing import Union
 
-from dstack._internal.server.models import JobModel, RunModel
+from dstack._internal.server.models import GatewayModel, JobModel, RunModel
 
 
-def fmt(model: Union[RunModel, JobModel]) -> str:
+def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
     """Consistent string representation of a model for logging."""
     if isinstance(model, RunModel):
         return f"run({model.id.hex[:6]}){model.run_name}"
     if isinstance(model, JobModel):
         return f"job({model.id.hex[:6]}){model.job_name}"
+    if isinstance(model, GatewayModel):
+        return f"gateway({model.id.hex[:6]}){model.name}"
     return str(model)

dstack/_internal/server/services/logs/__init__.py
@@ -8,7 +8,11 @@ from dstack._internal.server.models import ProjectModel
 from dstack._internal.server.schemas.logs import PollLogsRequest
 from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
-from dstack._internal.server.services.logs.base import LogStorage, LogStorageError
+from dstack._internal.server.services.logs.base import (
+    LogStorage,
+    LogStorageError,
+    b64encode_raw_message,
+)
 from dstack._internal.server.services.logs.filelog import FileLogStorage
 from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
 from dstack._internal.utils.common import run_async
@@ -75,4 +79,13 @@ def write_logs(
 
 
 async def poll_logs_async(project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
-    return await run_async(get_log_storage().poll_logs, project=project, request=request)
+    job_submission_logs = await run_async(
+        get_log_storage().poll_logs, project=project, request=request
+    )
+    # Logs are stored in plaintext but transmitted in base64 for API/CLI backward compatibility.
+    # Old logs stored in base64 are encoded twice for transmission and shown as base64 in CLI/UI.
+    # We live with that.
+    # TODO: Drop base64 encoding in 0.20.
+    for log_event in job_submission_logs.logs:
+        log_event.message = b64encode_raw_message(log_event.message.encode())
+    return job_submission_logs
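
The backward-compatibility note above has a visible consequence: messages now stored as plaintext are base64-encoded once at the API boundary, while legacy messages that were already stored base64-encoded get encoded a second time, so old logs render as base64 in the CLI/UI until 0.20 drops the encoding. A quick illustration, assuming `b64encode_raw_message` is a thin wrapper over `base64.b64encode`:

```python
import base64

def b64encode_raw_message(message: bytes) -> str:
    # Assumed behavior of services.logs.base.b64encode_raw_message.
    return base64.b64encode(message).decode()

new_style = "hello"                       # stored as plaintext from this release on
legacy = b64encode_raw_message(b"hello")  # stored base64-encoded by older releases

print(b64encode_raw_message(new_style.encode()))  # aGVsbG8= -> client decodes to "hello"
print(b64encode_raw_message(legacy.encode()))     # YUdWc2JHOD0= -> client decodes to "aGVsbG8="
```
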

dstack/_internal/server/services/logs/aws.py
@@ -17,7 +17,6 @@ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
     LogStorageError,
-    b64encode_raw_message,
     datetime_to_unix_time_ms,
     unix_time_ms_to_datetime,
 )
@@ -56,6 +55,8 @@ class CloudWatchLogStorage(LogStorage):
     PAST_EVENT_MAX_DELTA = int((timedelta(days=14)).total_seconds()) * 1000 - CLOCK_DRIFT
     # "None of the log events in the batch can be more than 2 hours in the future."
     FUTURE_EVENT_MAX_DELTA = int((timedelta(hours=2)).total_seconds()) * 1000 - CLOCK_DRIFT
+    # Maximum number of retries when polling for log events to skip empty pages.
+    MAX_RETRIES = 10
 
     def __init__(self, *, group: str, region: Optional[str] = None) -> None:
         with self._wrap_boto_errors():
@@ -81,7 +82,7 @@ class CloudWatchLogStorage(LogStorage):
         next_token: Optional[str] = None
         with self._wrap_boto_errors():
             try:
-                cw_events, next_token = self._get_log_events(stream, request)
+                cw_events, next_token = self._get_log_events_with_retry(stream, request)
             except botocore.exceptions.ClientError as e:
                 if not self._is_resource_not_found_exception(e):
                     raise
@@ -102,7 +103,47 @@ class CloudWatchLogStorage(LogStorage):
             )
             for cw_event in cw_events
         ]
-        return JobSubmissionLogs(logs=logs, next_token=next_token if len(logs) > 0 else None)
+        return JobSubmissionLogs(logs=logs, next_token=next_token)
+
+    def _get_log_events_with_retry(
+        self, stream: str, request: PollLogsRequest
+    ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
+        current_request = request
+        previous_next_token = request.next_token
+
+        for attempt in range(self.MAX_RETRIES):
+            cw_events, next_token = self._get_log_events(stream, current_request)
+
+            if cw_events:
+                return cw_events, next_token
+
+            if not next_token or next_token == previous_next_token:
+                return [], None
+
+            previous_next_token = next_token
+            current_request = PollLogsRequest(
+                run_name=request.run_name,
+                job_submission_id=request.job_submission_id,
+                start_time=request.start_time,
+                end_time=request.end_time,
+                descending=request.descending,
+                next_token=next_token,
+                limit=request.limit,
+                diagnose=request.diagnose,
+            )
+
+        if not request.descending:
+            logger.debug(
+                "Stream %s: exhausted %d retries without finding logs, returning empty response",
+                stream,
+                self.MAX_RETRIES,
+            )
+        # Only return the next token after exhausting retries if going descending—
+        # AWS CloudWatch guarantees more logs in that case. In ascending mode,
+        # next token is always returned, even if no logs remain.
+        # So descending works reliably; ascending has limits if gaps are too large.
+        # In the future, UI/CLI should handle retries, and we can return next token for ascending too.
+        return [], next_token if request.descending else None
 
     def _get_log_events(
         self, stream: str, request: PollLogsRequest
@@ -116,7 +157,7 @@ class CloudWatchLogStorage(LogStorage):
         }
 
         if request.start_time:
-            parameters["startTime"] = datetime_to_unix_time_ms(request.start_time) + 1
+            parameters["startTime"] = datetime_to_unix_time_ms(request.start_time)
 
         if request.end_time:
             parameters["endTime"] = datetime_to_unix_time_ms(request.end_time)
@@ -238,8 +279,7 @@ class CloudWatchLogStorage(LogStorage):
                 skipped_future_events += 1
                 continue
             cw_event = self._runner_log_event_to_cloudwatch_event(event)
-            # as message is base64-encoded, length in bytes = length in code points.
-            message_size = len(cw_event["message"]) + self.MESSAGE_OVERHEAD_SIZE
+            message_size = len(event.message) + self.MESSAGE_OVERHEAD_SIZE
             if message_size > self.MESSAGE_MAX_SIZE:
                 # we should never hit this limit, as we use `io.Copy` to copy from pty to logs,
                 # which under the hood uses 32KiB buffer, see runner/internal/executor/executor.go,
@@ -271,7 +311,7 @@ class CloudWatchLogStorage(LogStorage):
     ) -> _CloudWatchLogEvent:
         return {
             "timestamp": runner_log_event.timestamp,
-            "message": b64encode_raw_message(runner_log_event.message),
+            "message": runner_log_event.message.decode(errors="replace"),
         }
 
     @contextmanager
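
The `_get_log_events_with_retry` helper added above works around a CloudWatch Logs quirk: `GetLogEvents` can return an empty page together with a `nextToken`, and the only end-of-stream signal is the token repeating. A toy model of that loop (the page function stands in for the boto3 call):

```python
def poll_with_retry(get_page, first_token=None, max_retries=10):
    prev_token = token = first_token
    for _ in range(max_retries):
        events, token = get_page(token)
        if events:
            return events, token
        if not token or token == prev_token:
            return [], None  # token stopped changing: no more events
        prev_token = token
    return [], None  # gave up after max_retries empty pages

# Three pages: two empty ones, then one with data.
pages = {None: ([], "t1"), "t1": ([], "t2"), "t2": (["log line"], "t3")}
print(poll_with_retry(lambda token: pages[token]))  # (['log line'], 't3')
```
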

dstack/_internal/server/services/logs/filelog.py
@@ -1,7 +1,9 @@
+import os
 from pathlib import Path
-from typing import List, Union
+from typing import Generator, List, Optional, Tuple, Union
 from uuid import UUID
 
+from dstack._internal.core.errors import ServerClientError
 from dstack._internal.core.models.logs import (
     JobSubmissionLogs,
     LogEvent,
@@ -14,8 +16,6 @@ from dstack._internal.server.schemas.logs import PollLogsRequest
 from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
-    LogStorageError,
-    b64encode_raw_message,
     unix_time_ms_to_datetime,
 )
 
@@ -30,9 +30,6 @@ class FileLogStorage(LogStorage):
         self.root = Path(root)
 
     def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
-        if request.descending:
-            raise LogStorageError("descending: true is not supported")
-
         log_producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
         log_file_path = self._get_log_file_path(
             project_name=project.name,
@@ -41,18 +38,17 @@
             producer=log_producer,
         )
 
+        if request.descending:
+            return self._poll_logs_descending(log_file_path, request)
+        else:
+            return self._poll_logs_ascending(log_file_path, request)
+
+    def _poll_logs_ascending(
+        self, log_file_path: Path, request: PollLogsRequest
+    ) -> JobSubmissionLogs:
         start_line = 0
         if request.next_token:
-            try:
-                start_line = int(request.next_token)
-                if start_line < 0:
-                    raise LogStorageError(
-                        f"Invalid next_token: {request.next_token}. Must be a non-negative integer."
-                    )
-            except ValueError:
-                raise LogStorageError(
-                    f"Invalid next_token: {request.next_token}. Must be a valid integer."
-                )
+            start_line = self._next_token(request)
 
         logs = []
         next_token = None
@@ -60,34 +56,140 @@
 
         try:
             with open(log_file_path) as f:
-                lines = f.readlines()
+                # Skip to start_line if needed
+                for _ in range(start_line):
+                    if f.readline() == "":
+                        # File is shorter than start_line
+                        return JobSubmissionLogs(logs=logs, next_token=next_token)
+                    current_line += 1
+
+                # Read lines one by one
+                while True:
+                    line = f.readline()
+                    if line == "":  # EOF
+                        break
 
-                for i, line in enumerate(lines):
-                    if current_line < start_line:
                     current_line += 1
-                        continue
 
-                    log_event = LogEvent.__response__.parse_raw(line)
-                    current_line += 1
+                    try:
+                        log_event = LogEvent.__response__.parse_raw(line)
+                    except Exception:
+                        # Skip malformed lines
+                        continue
 
-                    if request.start_time and log_event.timestamp <= request.start_time:
+                    if request.start_time and log_event.timestamp <= request.start_time:
+                        continue
+                    if request.end_time is not None and log_event.timestamp >= request.end_time:
+                        break
+
+                    logs.append(log_event)
+
+                    if len(logs) >= request.limit:
+                        # Check if there are more lines to read
+                        if f.readline() != "":
+                            next_token = str(current_line)
+                            break
+        except FileNotFoundError:
+            pass
+
+        return JobSubmissionLogs(logs=logs, next_token=next_token)
+
+    def _poll_logs_descending(
+        self, log_file_path: Path, request: PollLogsRequest
+    ) -> JobSubmissionLogs:
+        start_offset = self._next_token(request)
+
+        candidate_logs = []
+
+        try:
+            line_generator = self._read_lines_reversed(log_file_path, start_offset)
+
+            for line_bytes, line_start_offset in line_generator:
+                try:
+                    line_str = line_bytes.decode("utf-8")
+                    log_event = LogEvent.__response__.parse_raw(line_str)
+                except Exception:
+                    continue  # Skip malformed lines
+
+                if request.end_time is not None and log_event.timestamp > request.end_time:
                     continue
-                    if request.end_time is not None and log_event.timestamp >= request.end_time:
+                if request.start_time and log_event.timestamp <= request.start_time:
                     break
 
-                    logs.append(log_event)
+                candidate_logs.append((log_event, line_start_offset))
 
-                    if len(logs) >= request.limit:
-                        # Only set next_token if there are more lines to read
-                        if current_line < len(lines):
-                            next_token = str(current_line)
+                if len(candidate_logs) > request.limit:
                     break
+        except FileNotFoundError:
+            return JobSubmissionLogs(logs=[], next_token=None)
 
-        except IOError as e:
-            raise LogStorageError(f"Failed to read log file {log_file_path}: {e}")
+        logs = [log for log, offset in candidate_logs[: request.limit]]
+        next_token = None
+        if len(candidate_logs) > request.limit:
+            # We fetched one more than the limit, so there are more pages.
+            # The next token should point to the start of the last log we are returning.
+            _last_log_event, last_log_offset = candidate_logs[request.limit - 1]
+            next_token = str(last_log_offset)
 
         return JobSubmissionLogs(logs=logs, next_token=next_token)
 
+    @staticmethod
+    def _read_lines_reversed(
+        filepath: Path, start_offset: Optional[int] = None, chunk_size: int = 8192
+    ) -> Generator[Tuple[bytes, int], None, None]:
+        """
+        A generator that yields lines from a file in reverse order, along with the byte
+        offset of the start of each line. This is memory-efficient for large files.
+        """
+        with open(filepath, "rb") as f:
+            f.seek(0, os.SEEK_END)
+            file_size = f.tell()
+            cursor = file_size
+
+            # If a start_offset is provided, optimize by starting the read
+            # from a more specific location instead of the end of the file.
+            if start_offset is not None and start_offset < file_size:
+                # To get the full content of the line that straddles the offset,
+                # we need to find its end (the next newline character).
+                f.seek(start_offset)
+                chunk = f.read(chunk_size)
+                newline_pos = chunk.find(b"\n")
+                if newline_pos != -1:
+                    # Found the end of the line. The cursor for reverse reading
+                    # should start from this point to include the full line.
+                    cursor = start_offset + newline_pos + 1
+                else:
+                    # No newline found, which means the rest of the file is one line.
+                    # The default cursor pointing to file_size is correct.
+                    pass
+
+            buffer = b""
+
+            while cursor > 0:
+                seek_pos = max(0, cursor - chunk_size)
+                amount_to_read = cursor - seek_pos
+                f.seek(seek_pos)
+                chunk = f.read(amount_to_read)
+                cursor = seek_pos
+
+                buffer = chunk + buffer
+
                while b"\n" in buffer:
+                    newline_pos = buffer.rfind(b"\n")
+                    line = buffer[newline_pos + 1 :]
+                    line_start_offset = cursor + newline_pos + 1
+
+                    # Skip lines that start at or after the start_offset
+                    if start_offset is None or line_start_offset < start_offset:
+                        yield line, line_start_offset
+
+                    buffer = buffer[:newline_pos]
+
+            # The remaining buffer is the first line of the file.
+            # Only yield it if we're not using start_offset or if it starts before start_offset
+            if buffer and (start_offset is None or 0 < start_offset):
+                yield buffer, 0
+
     def write_logs(
         self,
         project: ProjectModel,
@@ -140,5 +242,19 @@ class FileLogStorage(LogStorage):
         return LogEvent(
             timestamp=unix_time_ms_to_datetime(runner_log_event.timestamp),
             log_source=LogEventSource.STDOUT,
-            message=b64encode_raw_message(runner_log_event.message),
+            message=runner_log_event.message.decode(errors="replace"),
         )
+
+    def _next_token(self, request: PollLogsRequest) -> Optional[int]:
+        next_token = request.next_token
+        if next_token is None:
+            return None
+        try:
+            value = int(next_token)
+            if value < 0:
+                raise ValueError("Offset must be non-negative")
+            return value
+        except (ValueError, TypeError):
+            raise ServerClientError(
+                f"Invalid next_token: {next_token}. Must be a non-negative integer."
+            )

dstack/_internal/server/services/logs/gcp.py
@@ -14,7 +14,6 @@ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
     LogStorageError,
-    b64encode_raw_message,
     unix_time_ms_to_datetime,
 )
 from dstack._internal.utils.common import batched
@@ -137,15 +136,14 @@ class GCPLogStorage(LogStorage):
         with self.logger.batch() as batcher:
             for batch in batched(logs, self.MAX_BATCH_SIZE):
                 for log in batch:
-                    message = b64encode_raw_message(log.message)
+                    message = log.message.decode(errors="replace")
                     timestamp = unix_time_ms_to_datetime(log.timestamp)
-                    # as message is base64-encoded, length in bytes = length in code points
-                    if len(message) > self.MAX_RUNNER_MESSAGE_SIZE:
+                    if len(log.message) > self.MAX_RUNNER_MESSAGE_SIZE:
                         logger.error(
                             "Stream %s: skipping event at %s, message exceeds max size: %d > %d",
                             stream_name,
                             timestamp.isoformat(),
-                            len(message),
+                            len(log.message),
                             self.MAX_RUNNER_MESSAGE_SIZE,
                         )
                         continue

dstack/_internal/server/services/prometheus/custom_metrics.py
@@ -1,4 +1,5 @@
 import itertools
+import json
 from collections import defaultdict
 from collections.abc import Generator, Iterable
 from datetime import timezone
@@ -177,6 +178,19 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
         metrics.add_sample(_JOB_CPU_TIME, labels, jmp.cpu_usage_micro / 1_000_000)
         metrics.add_sample(_JOB_MEMORY_USAGE, labels, jmp.memory_usage_bytes)
         metrics.add_sample(_JOB_MEMORY_WORKING_SET, labels, jmp.memory_working_set_bytes)
+        if gpus:
+            gpu_memory_total = gpus[0].memory_mib * 1024 * 1024
+            for gpu_num, (gpu_util, gpu_memory_usage) in enumerate(
+                zip(
+                    json.loads(jmp.gpus_util_percent),
+                    json.loads(jmp.gpus_memory_usage_bytes),
+                )
+            ):
+                gpu_labels = labels.copy()
+                gpu_labels["dstack_gpu_num"] = gpu_num
+                metrics.add_sample(_JOB_GPU_USAGE_RATIO, gpu_labels, gpu_util / 100)
+                metrics.add_sample(_JOB_GPU_MEMORY_TOTAL, gpu_labels, gpu_memory_total)
+                metrics.add_sample(_JOB_GPU_MEMORY_USAGE, gpu_labels, gpu_memory_usage)
         jpm = job_prometheus_metrics.get(job.id)
         if jpm is not None:
             for metric in text_string_to_metric_families(jpm.text):
@@ -202,6 +216,9 @@ _JOB_CPU_TIME = "dstack_job_cpu_time_seconds_total"
 _JOB_MEMORY_TOTAL = "dstack_job_memory_total_bytes"
 _JOB_MEMORY_USAGE = "dstack_job_memory_usage_bytes"
 _JOB_MEMORY_WORKING_SET = "dstack_job_memory_working_set_bytes"
+_JOB_GPU_USAGE_RATIO = "dstack_job_gpu_usage_ratio"
+_JOB_GPU_MEMORY_TOTAL = "dstack_job_gpu_memory_total_bytes"
+_JOB_GPU_MEMORY_USAGE = "dstack_job_gpu_memory_usage_bytes"
 
 
 class _Metrics(dict[str, Metric]):
@@ -259,6 +276,9 @@ class _JobMetrics(_Metrics):
         (_JOB_MEMORY_TOTAL, _GAUGE, "Total memory allocated for the job, bytes"),
         (_JOB_MEMORY_USAGE, _GAUGE, "Memory used by the job (including cache), bytes"),
         (_JOB_MEMORY_WORKING_SET, _GAUGE, "Memory used by the job (not including cache), bytes"),
+        (_JOB_GPU_USAGE_RATIO, _GAUGE, "Job GPU usage, percent (as 0.0-1.0)"),
+        (_JOB_GPU_MEMORY_TOTAL, _GAUGE, "Total GPU memory allocated for the job, bytes"),
+        (_JOB_GPU_MEMORY_USAGE, _GAUGE, "GPU memory used by the job, bytes"),
     ]
 
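
The metrics row stores per-GPU series as JSON-encoded lists, which the exporter above zips into per-GPU samples with a `dstack_gpu_num` label. A small sketch of that transformation (values illustrative):

```python
import json

gpus_util_percent = "[50, 25]"                      # as stored on the metrics row
gpus_memory_usage_bytes = "[1073741824, 536870912]"

for gpu_num, (util, mem) in enumerate(
    zip(json.loads(gpus_util_percent), json.loads(gpus_memory_usage_bytes))
):
    # One dstack_job_gpu_usage_ratio and one dstack_job_gpu_memory_usage_bytes
    # sample per GPU; utilization is rescaled from percent to a 0.0-1.0 ratio.
    print(f'dstack_gpu_num="{gpu_num}"', util / 100, mem)
# dstack_gpu_num="0" 0.5 1073741824
# dstack_gpu_num="1" 0.25 536870912
```
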

dstack/_internal/server/services/proxy/repo.py
@@ -12,10 +12,12 @@ from dstack._internal.core.models.configurations import ServiceConfiguration
 from dstack._internal.core.models.instances import RemoteConnectionInfo, SSHConnectionParams
 from dstack._internal.core.models.runs import (
     JobProvisioningData,
+    JobSpec,
     JobStatus,
     RunSpec,
     RunStatus,
     ServiceSpec,
+    get_service_port,
 )
 from dstack._internal.core.models.services import AnyModel
 from dstack._internal.proxy.lib.models import (
@@ -97,9 +99,10 @@ class ServerProxyRepo(BaseProxyRepo):
         if rci.ssh_proxy is not None:
             ssh_head_proxy = rci.ssh_proxy
             ssh_head_proxy_private_key = get_or_error(rci.ssh_proxy_keys)[0].private
+        job_spec: JobSpec = JobSpec.__response__.parse_raw(job.job_spec_data)
         replica = Replica(
             id=job.id.hex,
-            app_port=run_spec.configuration.port.container_port,
+            app_port=get_service_port(job_spec, run_spec.configuration),
             ssh_destination=ssh_destination,
             ssh_port=ssh_port,
             ssh_proxy=ssh_proxy,
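
Both call sites in this file now go through `get_service_port(job_spec, configuration)` instead of reading `configuration.port.container_port` directly. The function's body is not part of this diff; a hedged sketch of what it plausibly does, given that `JobSpec` now carries a `service_port` populated by the configurator and that job specs from older releases predate the field:

```python
from typing import Optional

# Hypothetical stand-ins; the real models live in dstack._internal.core.models.
class PortMapping:
    def __init__(self, container_port: int):
        self.container_port = container_port

class ServiceConfiguration:
    def __init__(self, port: PortMapping):
        self.port = port

class JobSpec:
    def __init__(self, service_port: Optional[int] = None):
        self.service_port = service_port

def get_service_port(job_spec: JobSpec, configuration: ServiceConfiguration) -> int:
    # Assumed logic: prefer the per-job port recorded at submission time and
    # fall back to the configuration for specs created before service_port existed.
    if job_spec.service_port is not None:
        return job_spec.service_port
    return configuration.port.container_port

conf = ServiceConfiguration(port=PortMapping(container_port=8000))
print(get_service_port(JobSpec(service_port=8001), conf))  # 8001
print(get_service_port(JobSpec(), conf))                   # 8000 (legacy spec)
```
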