dstack 0.19.17__py3-none-any.whl → 0.19.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. See the registry's release page for more details.

Files changed (86)
  1. dstack/_internal/cli/services/configurators/fleet.py +111 -1
  2. dstack/_internal/cli/services/profile.py +1 -1
  3. dstack/_internal/core/backends/aws/compute.py +237 -18
  4. dstack/_internal/core/backends/base/compute.py +20 -2
  5. dstack/_internal/core/backends/cudo/compute.py +23 -9
  6. dstack/_internal/core/backends/gcp/compute.py +13 -7
  7. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  8. dstack/_internal/core/compatibility/fleets.py +12 -11
  9. dstack/_internal/core/compatibility/gateways.py +9 -8
  10. dstack/_internal/core/compatibility/logs.py +4 -3
  11. dstack/_internal/core/compatibility/runs.py +29 -21
  12. dstack/_internal/core/compatibility/volumes.py +11 -8
  13. dstack/_internal/core/errors.py +4 -0
  14. dstack/_internal/core/models/common.py +45 -2
  15. dstack/_internal/core/models/configurations.py +9 -1
  16. dstack/_internal/core/models/fleets.py +2 -1
  17. dstack/_internal/core/models/profiles.py +8 -5
  18. dstack/_internal/core/models/resources.py +15 -8
  19. dstack/_internal/core/models/runs.py +41 -138
  20. dstack/_internal/core/models/volumes.py +14 -0
  21. dstack/_internal/core/services/diff.py +56 -3
  22. dstack/_internal/core/services/ssh/attach.py +2 -0
  23. dstack/_internal/server/app.py +37 -9
  24. dstack/_internal/server/background/__init__.py +66 -40
  25. dstack/_internal/server/background/tasks/process_fleets.py +19 -3
  26. dstack/_internal/server/background/tasks/process_gateways.py +47 -29
  27. dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
  28. dstack/_internal/server/background/tasks/process_instances.py +13 -2
  29. dstack/_internal/server/background/tasks/process_placement_groups.py +4 -2
  30. dstack/_internal/server/background/tasks/process_running_jobs.py +14 -3
  31. dstack/_internal/server/background/tasks/process_runs.py +8 -4
  32. dstack/_internal/server/background/tasks/process_submitted_jobs.py +38 -7
  33. dstack/_internal/server/background/tasks/process_terminating_jobs.py +5 -3
  34. dstack/_internal/server/background/tasks/process_volumes.py +2 -2
  35. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
  36. dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
  37. dstack/_internal/server/models.py +1 -0
  38. dstack/_internal/server/routers/backends.py +23 -16
  39. dstack/_internal/server/routers/files.py +7 -6
  40. dstack/_internal/server/routers/fleets.py +47 -36
  41. dstack/_internal/server/routers/gateways.py +27 -18
  42. dstack/_internal/server/routers/instances.py +18 -13
  43. dstack/_internal/server/routers/logs.py +7 -3
  44. dstack/_internal/server/routers/metrics.py +14 -8
  45. dstack/_internal/server/routers/projects.py +33 -22
  46. dstack/_internal/server/routers/repos.py +7 -6
  47. dstack/_internal/server/routers/runs.py +49 -28
  48. dstack/_internal/server/routers/secrets.py +20 -15
  49. dstack/_internal/server/routers/server.py +7 -4
  50. dstack/_internal/server/routers/users.py +22 -19
  51. dstack/_internal/server/routers/volumes.py +34 -25
  52. dstack/_internal/server/schemas/logs.py +2 -2
  53. dstack/_internal/server/schemas/runs.py +17 -5
  54. dstack/_internal/server/services/fleets.py +358 -75
  55. dstack/_internal/server/services/gateways/__init__.py +17 -6
  56. dstack/_internal/server/services/gateways/client.py +5 -3
  57. dstack/_internal/server/services/instances.py +8 -0
  58. dstack/_internal/server/services/jobs/__init__.py +45 -0
  59. dstack/_internal/server/services/jobs/configurators/base.py +12 -1
  60. dstack/_internal/server/services/locking.py +104 -13
  61. dstack/_internal/server/services/logging.py +4 -2
  62. dstack/_internal/server/services/logs/__init__.py +15 -2
  63. dstack/_internal/server/services/logs/aws.py +2 -4
  64. dstack/_internal/server/services/logs/filelog.py +33 -27
  65. dstack/_internal/server/services/logs/gcp.py +3 -5
  66. dstack/_internal/server/services/proxy/repo.py +4 -1
  67. dstack/_internal/server/services/runs.py +139 -72
  68. dstack/_internal/server/services/services/__init__.py +2 -1
  69. dstack/_internal/server/services/users.py +3 -1
  70. dstack/_internal/server/services/volumes.py +15 -2
  71. dstack/_internal/server/settings.py +25 -6
  72. dstack/_internal/server/statics/index.html +1 -1
  73. dstack/_internal/server/statics/{main-d151637af20f70b2e796.js → main-64f8273740c4b52c18f5.js} +71 -67
  74. dstack/_internal/server/statics/{main-d151637af20f70b2e796.js.map → main-64f8273740c4b52c18f5.js.map} +1 -1
  75. dstack/_internal/server/statics/{main-d48635d8fe670d53961c.css → main-d58fc0460cb0eae7cb5c.css} +1 -1
  76. dstack/_internal/server/testing/common.py +48 -8
  77. dstack/_internal/server/utils/routers.py +31 -8
  78. dstack/_internal/utils/json_utils.py +54 -0
  79. dstack/api/_public/runs.py +13 -2
  80. dstack/api/server/_runs.py +12 -2
  81. dstack/version.py +1 -1
  82. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/METADATA +17 -14
  83. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/RECORD +86 -83
  84. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/WHEEL +0 -0
  85. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/entry_points.txt +0 -0
  86. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/licenses/LICENSE.md +0 -0
@@ -7,9 +7,9 @@ from pydantic import parse_obj_as
7
7
 
8
8
  from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
9
9
  from dstack._internal.core.errors import GatewayError
10
- from dstack._internal.core.models.configurations import RateLimit
10
+ from dstack._internal.core.models.configurations import RateLimit, ServiceConfiguration
11
11
  from dstack._internal.core.models.instances import SSHConnectionParams
12
- from dstack._internal.core.models.runs import JobSubmission, Run
12
+ from dstack._internal.core.models.runs import JobSpec, JobSubmission, Run, get_service_port
13
13
  from dstack._internal.proxy.gateway.schemas.stats import ServiceStats
14
14
  from dstack._internal.server import settings
15
15
 
@@ -80,13 +80,15 @@ class GatewayClient:
80
80
  async def register_replica(
81
81
  self,
82
82
  run: Run,
83
+ job_spec: JobSpec,
83
84
  job_submission: JobSubmission,
84
85
  ssh_head_proxy: Optional[SSHConnectionParams],
85
86
  ssh_head_proxy_private_key: Optional[str],
86
87
  ):
88
+ assert isinstance(run.run_spec.configuration, ServiceConfiguration)
87
89
  payload = {
88
90
  "job_id": job_submission.id.hex,
89
- "app_port": run.run_spec.configuration.port.container_port,
91
+ "app_port": get_service_port(job_spec, run.run_spec.configuration),
90
92
  "ssh_head_proxy": ssh_head_proxy.dict() if ssh_head_proxy is not None else None,
91
93
  "ssh_head_proxy_private_key": ssh_head_proxy_private_key,
92
94
  }
@@ -106,6 +106,14 @@ def get_instance_requirements(instance_model: InstanceModel) -> Requirements:
106
106
  return Requirements.__response__.parse_raw(instance_model.requirements)
107
107
 
108
108
 
109
+ def get_instance_remote_connection_info(
110
+ instance_model: InstanceModel,
111
+ ) -> Optional[RemoteConnectionInfo]:
112
+ if instance_model.remote_connection_info is None:
113
+ return None
114
+ return RemoteConnectionInfo.__response__.parse_raw(instance_model.remote_connection_info)
115
+
116
+
109
117
  def get_instance_ssh_private_keys(instance_model: InstanceModel) -> tuple[str, Optional[str]]:
110
118
  """
111
119
  Returns a pair of SSH private keys: host key and optional proxy jump key.
@@ -134,6 +134,8 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
134
134
  finished_at = None
135
135
  if job_model.status.is_finished():
136
136
  finished_at = last_processed_at
137
+ status_message = _get_job_status_message(job_model)
138
+ error = _get_job_error(job_model)
137
139
  return JobSubmission(
138
140
  id=job_model.id,
139
141
  submission_num=job_model.submission_num,
@@ -143,11 +145,13 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
143
145
  finished_at=finished_at,
144
146
  inactivity_secs=job_model.inactivity_secs,
145
147
  status=job_model.status,
148
+ status_message=status_message,
146
149
  termination_reason=job_model.termination_reason,
147
150
  termination_reason_message=job_model.termination_reason_message,
148
151
  exit_status=job_model.exit_status,
149
152
  job_provisioning_data=job_provisioning_data,
150
153
  job_runtime_data=get_job_runtime_data(job_model),
154
+ error=error,
151
155
  )
152
156
 
153
157
 
@@ -289,6 +293,19 @@ async def process_terminating_job(
289
293
  # so that stuck volumes don't prevent the instance from terminating.
290
294
  job_model.instance_id = None
291
295
  instance_model.last_job_processed_at = common.get_current_datetime()
296
+
297
+ volume_names = (
298
+ jrd.volume_names
299
+ if jrd and jrd.volume_names
300
+ else [va.volume.name for va in instance_model.volume_attachments]
301
+ )
302
+ if volume_names:
303
+ volumes = await list_project_volume_models(
304
+ session=session, project=instance_model.project, names=volume_names
305
+ )
306
+ for volume in volumes:
307
+ volume.last_job_processed_at = common.get_current_datetime()
308
+
292
309
  logger.info(
293
310
  "%s: instance '%s' has been released, new status is %s",
294
311
  fmt(job_model),
@@ -693,3 +710,31 @@ def _get_job_mount_point_attached_volume(
693
710
  continue
694
711
  return volume
695
712
  raise ServerClientError("Failed to find an eligible volume for the mount point")
713
+
714
+
715
+ def _get_job_status_message(job_model: JobModel) -> str:
716
+ if job_model.status == JobStatus.DONE:
717
+ return "exited (0)"
718
+ elif job_model.status == JobStatus.FAILED:
719
+ if job_model.termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
720
+ return f"exited ({job_model.exit_status})"
721
+ elif (
722
+ job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
723
+ ):
724
+ return "no offers"
725
+ elif job_model.termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
726
+ return "interrupted"
727
+ else:
728
+ return "error"
729
+ elif job_model.status == JobStatus.TERMINATED:
730
+ if job_model.termination_reason == JobTerminationReason.TERMINATED_BY_USER:
731
+ return "stopped"
732
+ elif job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER:
733
+ return "aborted"
734
+ return job_model.status.value
735
+
736
+
737
+ def _get_job_error(job_model: JobModel) -> Optional[str]:
738
+ if job_model.termination_reason is None:
739
+ return None
740
+ return job_model.termination_reason.to_error()
@@ -1,5 +1,6 @@
1
1
  import shlex
2
2
  import sys
3
+ import threading
3
4
  from abc import ABC, abstractmethod
4
5
  from pathlib import PurePosixPath
5
6
  from typing import Dict, List, Optional, Union
@@ -14,6 +15,7 @@ from dstack._internal.core.models.configurations import (
14
15
  PortMapping,
15
16
  PythonVersion,
16
17
  RunConfigurationType,
18
+ ServiceConfiguration,
17
19
  )
18
20
  from dstack._internal.core.models.profiles import (
19
21
  DEFAULT_STOP_DURATION,
@@ -152,6 +154,7 @@ class JobConfigurator(ABC):
152
154
  repo_data=self.run_spec.repo_data,
153
155
  repo_code_hash=self.run_spec.repo_code_hash,
154
156
  file_archives=self.run_spec.file_archives,
157
+ service_port=self._service_port(),
155
158
  )
156
159
  return job_spec
157
160
 
@@ -305,6 +308,11 @@ class JobConfigurator(ABC):
305
308
  )
306
309
  return self._job_ssh_key
307
310
 
311
+ def _service_port(self) -> Optional[int]:
312
+ if isinstance(self.run_spec.configuration, ServiceConfiguration):
313
+ return self.run_spec.configuration.port.container_port
314
+ return None
315
+
308
316
 
309
317
  def interpolate_job_volumes(
310
318
  run_volumes: List[Union[MountPoint, str]],
@@ -354,7 +362,10 @@ def _join_shell_commands(commands: List[str]) -> str:
354
362
  return " && ".join(commands)
355
363
 
356
364
 
357
- @cached(TTLCache(maxsize=2048, ttl=80))
365
+ @cached(
366
+ cache=TTLCache(maxsize=2048, ttl=80),
367
+ lock=threading.Lock(),
368
+ )
358
369
  def _get_image_config(image: str, registry_auth: Optional[RegistryAuth]) -> ImageConfig:
359
370
  try:
360
371
  return get_image_config(image, registry_auth).config
@@ -1,8 +1,10 @@
1
1
  import asyncio
2
+ import collections.abc
2
3
  import hashlib
4
+ from abc import abstractmethod
3
5
  from asyncio import Lock
4
6
  from contextlib import asynccontextmanager
5
- from typing import AsyncGenerator, Dict, List, Set, Tuple, TypeVar, Union
7
+ from typing import AsyncGenerator, Iterable, Iterator, Protocol, TypeVar, Union
6
8
 
7
9
  from sqlalchemy import func, select
8
10
  from sqlalchemy.ext.asyncio import AsyncConnection, AsyncSession
@@ -10,23 +12,54 @@ from sqlalchemy.ext.asyncio import AsyncConnection, AsyncSession
10
12
  KeyT = TypeVar("KeyT")
11
13
 
12
14
 
13
- class ResourceLocker:
14
- def __init__(self):
15
- self.namespace_to_locks_map: Dict[str, Tuple[Lock, set]] = {}
15
+ class LocksetLock(Protocol):
16
+ async def acquire(self) -> bool: ...
17
+ def release(self) -> None: ...
18
+ async def __aenter__(self): ...
19
+ async def __aexit__(self, exc_type, exc, tb): ...
20
+
21
+
22
+ T = TypeVar("T")
23
+
16
24
 
17
- def get_lockset(self, namespace: str) -> Tuple[Lock, set]:
25
+ class Lockset(Protocol[T]):
26
+ def __contains__(self, item: T) -> bool: ...
27
+ def __iter__(self) -> Iterator[T]: ...
28
+ def __len__(self) -> int: ...
29
+ def add(self, item: T) -> None: ...
30
+ def discard(self, item: T) -> None: ...
31
+ def update(self, other: Iterable[T]) -> None: ...
32
+ def difference_update(self, other: Iterable[T]) -> None: ...
33
+
34
+
35
+ class ResourceLocker:
36
+ @abstractmethod
37
+ def get_lockset(self, namespace: str) -> tuple[LocksetLock, Lockset]:
18
38
  """
19
39
  Returns a lockset containing locked resources for in-memory locking.
20
40
  Also returns a lock that guards the lockset.
21
41
  """
22
- return self.namespace_to_locks_map.setdefault(namespace, (Lock(), set()))
42
+ pass
23
43
 
44
+ @abstractmethod
24
45
  @asynccontextmanager
25
- async def lock_ctx(self, namespace: str, keys: List[KeyT]):
46
+ async def lock_ctx(self, namespace: str, keys: list[KeyT]):
26
47
  """
27
48
  Acquires locks for all keys in namespace.
28
49
  The keys must be sorted to prevent deadlock.
29
50
  """
51
+ yield
52
+
53
+
54
+ class InMemoryResourceLocker(ResourceLocker):
55
+ def __init__(self):
56
+ self.namespace_to_locks_map: dict[str, tuple[Lock, set]] = {}
57
+
58
+ def get_lockset(self, namespace: str) -> tuple[Lock, set]:
59
+ return self.namespace_to_locks_map.setdefault(namespace, (Lock(), set()))
60
+
61
+ @asynccontextmanager
62
+ async def lock_ctx(self, namespace: str, keys: list[KeyT]):
30
63
  lock, lockset = self.get_lockset(namespace)
31
64
  try:
32
65
  await _wait_to_lock_many(lock, lockset, keys)
@@ -35,6 +68,56 @@ class ResourceLocker:
35
68
  lockset.difference_update(keys)
36
69
 
37
70
 
71
+ class DummyAsyncLock:
72
+ async def __aenter__(self):
73
+ pass
74
+
75
+ async def __aexit__(self, exc_type, exc, tb):
76
+ pass
77
+
78
+ async def acquire(self):
79
+ return True
80
+
81
+ def release(self):
82
+ pass
83
+
84
+
85
+ class DummySet(collections.abc.MutableSet):
86
+ def __contains__(self, item):
87
+ return False
88
+
89
+ def __iter__(self):
90
+ return iter(())
91
+
92
+ def __len__(self):
93
+ return 0
94
+
95
+ def add(self, value):
96
+ pass
97
+
98
+ def discard(self, value):
99
+ pass
100
+
101
+ def update(self, other):
102
+ pass
103
+
104
+ def difference_update(self, other):
105
+ pass
106
+
107
+
108
+ class DummyResourceLocker(ResourceLocker):
109
+ def __init__(self):
110
+ self.lock = DummyAsyncLock()
111
+ self.lockset = DummySet()
112
+
113
+ def get_lockset(self, namespace: str) -> tuple[DummyAsyncLock, DummySet]:
114
+ return self.lock, self.lockset
115
+
116
+ @asynccontextmanager
117
+ async def lock_ctx(self, namespace: str, keys: list[KeyT]):
118
+ yield
119
+
120
+
38
121
  def string_to_lock_id(s: str) -> int:
39
122
  return int(hashlib.sha256(s.encode()).hexdigest(), 16) % (2**63)
40
123
 
@@ -67,15 +150,21 @@ async def try_advisory_lock_ctx(
67
150
  await bind.execute(select(func.pg_advisory_unlock(string_to_lock_id(resource))))
68
151
 
69
152
 
70
- _locker = ResourceLocker()
153
+ _in_memory_locker = InMemoryResourceLocker()
154
+ _dummy_locker = DummyResourceLocker()
71
155
 
72
156
 
73
- def get_locker() -> ResourceLocker:
74
- return _locker
157
+ def get_locker(dialect_name: str) -> ResourceLocker:
158
+ if dialect_name == "sqlite":
159
+ return _in_memory_locker
160
+ # We could use an in-memory locker on Postgres
161
+ # but it can lead to unnecessary lock contention,
162
+ # so we use a dummy locker that does not take any locks.
163
+ return _dummy_locker
75
164
 
76
165
 
77
166
  async def _wait_to_lock_many(
78
- lock: asyncio.Lock, locked: Set[KeyT], keys: List[KeyT], *, delay: float = 0.1
167
+ lock: asyncio.Lock, locked: set[KeyT], keys: list[KeyT], *, delay: float = 0.1
79
168
  ):
80
169
  """
81
170
  Retry locking until all the keys are locked.
@@ -83,14 +172,16 @@ async def _wait_to_lock_many(
83
172
  The keys must be sorted to prevent deadlock.
84
173
  """
85
174
  left_to_lock = keys.copy()
86
- while len(left_to_lock) > 0:
175
+ while True:
87
176
  async with lock:
88
177
  locked_now_num = 0
89
178
  for key in left_to_lock:
90
179
  if key in locked:
91
- # Someone already aquired the lock, wait
180
+ # Someone already acquired the lock, wait
92
181
  break
93
182
  locked.add(key)
94
183
  locked_now_num += 1
95
184
  left_to_lock = left_to_lock[locked_now_num:]
185
+ if not left_to_lock:
186
+ return
96
187
  await asyncio.sleep(delay)
@@ -1,12 +1,14 @@
1
1
  from typing import Union
2
2
 
3
- from dstack._internal.server.models import JobModel, RunModel
3
+ from dstack._internal.server.models import GatewayModel, JobModel, RunModel
4
4
 
5
5
 
6
- def fmt(model: Union[RunModel, JobModel]) -> str:
6
+ def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
7
7
  """Consistent string representation of a model for logging."""
8
8
  if isinstance(model, RunModel):
9
9
  return f"run({model.id.hex[:6]}){model.run_name}"
10
10
  if isinstance(model, JobModel):
11
11
  return f"job({model.id.hex[:6]}){model.job_name}"
12
+ if isinstance(model, GatewayModel):
13
+ return f"gateway({model.id.hex[:6]}){model.name}"
12
14
  return str(model)
@@ -8,7 +8,11 @@ from dstack._internal.server.models import ProjectModel
8
8
  from dstack._internal.server.schemas.logs import PollLogsRequest
9
9
  from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
10
10
  from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
11
- from dstack._internal.server.services.logs.base import LogStorage, LogStorageError
11
+ from dstack._internal.server.services.logs.base import (
12
+ LogStorage,
13
+ LogStorageError,
14
+ b64encode_raw_message,
15
+ )
12
16
  from dstack._internal.server.services.logs.filelog import FileLogStorage
13
17
  from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
14
18
  from dstack._internal.utils.common import run_async
@@ -75,4 +79,13 @@ def write_logs(
75
79
 
76
80
 
77
81
  async def poll_logs_async(project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
78
- return await run_async(get_log_storage().poll_logs, project=project, request=request)
82
+ job_submission_logs = await run_async(
83
+ get_log_storage().poll_logs, project=project, request=request
84
+ )
85
+ # Logs are stored in plaintext but transmitted in base64 for API/CLI backward compatibility.
86
+ # Old logs stored in base64 are encoded twice for transmission and shown as base64 in CLI/UI.
87
+ # We live with that.
88
+ # TODO: Drop base64 encoding in 0.20.
89
+ for log_event in job_submission_logs.logs:
90
+ log_event.message = b64encode_raw_message(log_event.message.encode())
91
+ return job_submission_logs
@@ -17,7 +17,6 @@ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
17
17
  from dstack._internal.server.services.logs.base import (
18
18
  LogStorage,
19
19
  LogStorageError,
20
- b64encode_raw_message,
21
20
  datetime_to_unix_time_ms,
22
21
  unix_time_ms_to_datetime,
23
22
  )
@@ -238,8 +237,7 @@ class CloudWatchLogStorage(LogStorage):
238
237
  skipped_future_events += 1
239
238
  continue
240
239
  cw_event = self._runner_log_event_to_cloudwatch_event(event)
241
- # as message is base64-encoded, length in bytes = length in code points.
242
- message_size = len(cw_event["message"]) + self.MESSAGE_OVERHEAD_SIZE
240
+ message_size = len(event.message) + self.MESSAGE_OVERHEAD_SIZE
243
241
  if message_size > self.MESSAGE_MAX_SIZE:
244
242
  # we should never hit this limit, as we use `io.Copy` to copy from pty to logs,
245
243
  # which under the hood uses 32KiB buffer, see runner/internal/executor/executor.go,
@@ -271,7 +269,7 @@ class CloudWatchLogStorage(LogStorage):
271
269
  ) -> _CloudWatchLogEvent:
272
270
  return {
273
271
  "timestamp": runner_log_event.timestamp,
274
- "message": b64encode_raw_message(runner_log_event.message),
272
+ "message": runner_log_event.message.decode(errors="replace"),
275
273
  }
276
274
 
277
275
  @contextmanager
@@ -2,6 +2,7 @@ from pathlib import Path
2
2
  from typing import List, Union
3
3
  from uuid import UUID
4
4
 
5
+ from dstack._internal.core.errors import ServerClientError
5
6
  from dstack._internal.core.models.logs import (
6
7
  JobSubmissionLogs,
7
8
  LogEvent,
@@ -14,8 +15,6 @@ from dstack._internal.server.schemas.logs import PollLogsRequest
14
15
  from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
15
16
  from dstack._internal.server.services.logs.base import (
16
17
  LogStorage,
17
- LogStorageError,
18
- b64encode_raw_message,
19
18
  unix_time_ms_to_datetime,
20
19
  )
21
20
 
@@ -30,9 +29,6 @@ class FileLogStorage(LogStorage):
30
29
  self.root = Path(root)
31
30
 
32
31
  def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
33
- if request.descending:
34
- raise LogStorageError("descending: true is not supported")
35
-
36
32
  log_producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
37
33
  log_file_path = self._get_log_file_path(
38
34
  project_name=project.name,
@@ -46,11 +42,11 @@ class FileLogStorage(LogStorage):
46
42
  try:
47
43
  start_line = int(request.next_token)
48
44
  if start_line < 0:
49
- raise LogStorageError(
45
+ raise ServerClientError(
50
46
  f"Invalid next_token: {request.next_token}. Must be a non-negative integer."
51
47
  )
52
48
  except ValueError:
53
- raise LogStorageError(
49
+ raise ServerClientError(
54
50
  f"Invalid next_token: {request.next_token}. Must be a valid integer."
55
51
  )
56
52
 
@@ -60,31 +56,41 @@ class FileLogStorage(LogStorage):
60
56
 
61
57
  try:
62
58
  with open(log_file_path) as f:
63
- lines = f.readlines()
64
-
65
- for i, line in enumerate(lines):
66
- if current_line < start_line:
59
+ # Skip to start_line if needed
60
+ for _ in range(start_line):
61
+ if f.readline() == "":
62
+ # File is shorter than start_line
63
+ return JobSubmissionLogs(logs=logs, next_token=next_token)
67
64
  current_line += 1
68
- continue
69
65
 
70
- log_event = LogEvent.__response__.parse_raw(line)
71
- current_line += 1
66
+ # Read lines one by one
67
+ while True:
68
+ line = f.readline()
69
+ if line == "": # EOF
70
+ break
71
+
72
+ current_line += 1
72
73
 
73
- if request.start_time and log_event.timestamp <= request.start_time:
74
- continue
75
- if request.end_time is not None and log_event.timestamp >= request.end_time:
76
- break
74
+ try:
75
+ log_event = LogEvent.__response__.parse_raw(line)
76
+ except Exception:
77
+ # Skip malformed lines
78
+ continue
77
79
 
78
- logs.append(log_event)
80
+ if request.start_time and log_event.timestamp <= request.start_time:
81
+ continue
82
+ if request.end_time is not None and log_event.timestamp >= request.end_time:
83
+ break
79
84
 
80
- if len(logs) >= request.limit:
81
- # Only set next_token if there are more lines to read
82
- if current_line < len(lines):
83
- next_token = str(current_line)
84
- break
85
+ logs.append(log_event)
85
86
 
86
- except IOError as e:
87
- raise LogStorageError(f"Failed to read log file {log_file_path}: {e}")
87
+ if len(logs) >= request.limit:
88
+ # Check if there are more lines to read
89
+ if f.readline() != "":
90
+ next_token = str(current_line)
91
+ break
92
+ except FileNotFoundError:
93
+ pass
88
94
 
89
95
  return JobSubmissionLogs(logs=logs, next_token=next_token)
90
96
 
@@ -140,5 +146,5 @@ class FileLogStorage(LogStorage):
140
146
  return LogEvent(
141
147
  timestamp=unix_time_ms_to_datetime(runner_log_event.timestamp),
142
148
  log_source=LogEventSource.STDOUT,
143
- message=b64encode_raw_message(runner_log_event.message),
149
+ message=runner_log_event.message.decode(errors="replace"),
144
150
  )
@@ -14,7 +14,6 @@ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
14
14
  from dstack._internal.server.services.logs.base import (
15
15
  LogStorage,
16
16
  LogStorageError,
17
- b64encode_raw_message,
18
17
  unix_time_ms_to_datetime,
19
18
  )
20
19
  from dstack._internal.utils.common import batched
@@ -137,15 +136,14 @@ class GCPLogStorage(LogStorage):
137
136
  with self.logger.batch() as batcher:
138
137
  for batch in batched(logs, self.MAX_BATCH_SIZE):
139
138
  for log in batch:
140
- message = b64encode_raw_message(log.message)
139
+ message = log.message.decode(errors="replace")
141
140
  timestamp = unix_time_ms_to_datetime(log.timestamp)
142
- # as message is base64-encoded, length in bytes = length in code points
143
- if len(message) > self.MAX_RUNNER_MESSAGE_SIZE:
141
+ if len(log.message) > self.MAX_RUNNER_MESSAGE_SIZE:
144
142
  logger.error(
145
143
  "Stream %s: skipping event at %s, message exceeds max size: %d > %d",
146
144
  stream_name,
147
145
  timestamp.isoformat(),
148
- len(message),
146
+ len(log.message),
149
147
  self.MAX_RUNNER_MESSAGE_SIZE,
150
148
  )
151
149
  continue
@@ -12,10 +12,12 @@ from dstack._internal.core.models.configurations import ServiceConfiguration
12
12
  from dstack._internal.core.models.instances import RemoteConnectionInfo, SSHConnectionParams
13
13
  from dstack._internal.core.models.runs import (
14
14
  JobProvisioningData,
15
+ JobSpec,
15
16
  JobStatus,
16
17
  RunSpec,
17
18
  RunStatus,
18
19
  ServiceSpec,
20
+ get_service_port,
19
21
  )
20
22
  from dstack._internal.core.models.services import AnyModel
21
23
  from dstack._internal.proxy.lib.models import (
@@ -97,9 +99,10 @@ class ServerProxyRepo(BaseProxyRepo):
97
99
  if rci.ssh_proxy is not None:
98
100
  ssh_head_proxy = rci.ssh_proxy
99
101
  ssh_head_proxy_private_key = get_or_error(rci.ssh_proxy_keys)[0].private
102
+ job_spec: JobSpec = JobSpec.__response__.parse_raw(job.job_spec_data)
100
103
  replica = Replica(
101
104
  id=job.id.hex,
102
- app_port=run_spec.configuration.port.container_port,
105
+ app_port=get_service_port(job_spec, run_spec.configuration),
103
106
  ssh_destination=ssh_destination,
104
107
  ssh_port=ssh_port,
105
108
  ssh_proxy=ssh_proxy,