dstack 0.19.17__py3-none-any.whl → 0.19.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/services/configurators/fleet.py +111 -1
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/core/backends/aws/compute.py +237 -18
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/cudo/compute.py +23 -9
- dstack/_internal/core/backends/gcp/compute.py +13 -7
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
- dstack/_internal/core/compatibility/fleets.py +12 -11
- dstack/_internal/core/compatibility/gateways.py +9 -8
- dstack/_internal/core/compatibility/logs.py +4 -3
- dstack/_internal/core/compatibility/runs.py +29 -21
- dstack/_internal/core/compatibility/volumes.py +11 -8
- dstack/_internal/core/errors.py +4 -0
- dstack/_internal/core/models/common.py +45 -2
- dstack/_internal/core/models/configurations.py +9 -1
- dstack/_internal/core/models/fleets.py +2 -1
- dstack/_internal/core/models/profiles.py +8 -5
- dstack/_internal/core/models/resources.py +15 -8
- dstack/_internal/core/models/runs.py +41 -138
- dstack/_internal/core/models/volumes.py +14 -0
- dstack/_internal/core/services/diff.py +56 -3
- dstack/_internal/core/services/ssh/attach.py +2 -0
- dstack/_internal/server/app.py +37 -9
- dstack/_internal/server/background/__init__.py +66 -40
- dstack/_internal/server/background/tasks/process_fleets.py +19 -3
- dstack/_internal/server/background/tasks/process_gateways.py +47 -29
- dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
- dstack/_internal/server/background/tasks/process_instances.py +13 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +4 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +14 -3
- dstack/_internal/server/background/tasks/process_runs.py +8 -4
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +38 -7
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +5 -3
- dstack/_internal/server/background/tasks/process_volumes.py +2 -2
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
- dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/backends.py +23 -16
- dstack/_internal/server/routers/files.py +7 -6
- dstack/_internal/server/routers/fleets.py +47 -36
- dstack/_internal/server/routers/gateways.py +27 -18
- dstack/_internal/server/routers/instances.py +18 -13
- dstack/_internal/server/routers/logs.py +7 -3
- dstack/_internal/server/routers/metrics.py +14 -8
- dstack/_internal/server/routers/projects.py +33 -22
- dstack/_internal/server/routers/repos.py +7 -6
- dstack/_internal/server/routers/runs.py +49 -28
- dstack/_internal/server/routers/secrets.py +20 -15
- dstack/_internal/server/routers/server.py +7 -4
- dstack/_internal/server/routers/users.py +22 -19
- dstack/_internal/server/routers/volumes.py +34 -25
- dstack/_internal/server/schemas/logs.py +2 -2
- dstack/_internal/server/schemas/runs.py +17 -5
- dstack/_internal/server/services/fleets.py +358 -75
- dstack/_internal/server/services/gateways/__init__.py +17 -6
- dstack/_internal/server/services/gateways/client.py +5 -3
- dstack/_internal/server/services/instances.py +8 -0
- dstack/_internal/server/services/jobs/__init__.py +45 -0
- dstack/_internal/server/services/jobs/configurators/base.py +12 -1
- dstack/_internal/server/services/locking.py +104 -13
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/__init__.py +15 -2
- dstack/_internal/server/services/logs/aws.py +2 -4
- dstack/_internal/server/services/logs/filelog.py +33 -27
- dstack/_internal/server/services/logs/gcp.py +3 -5
- dstack/_internal/server/services/proxy/repo.py +4 -1
- dstack/_internal/server/services/runs.py +139 -72
- dstack/_internal/server/services/services/__init__.py +2 -1
- dstack/_internal/server/services/users.py +3 -1
- dstack/_internal/server/services/volumes.py +15 -2
- dstack/_internal/server/settings.py +25 -6
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d151637af20f70b2e796.js → main-64f8273740c4b52c18f5.js} +71 -67
- dstack/_internal/server/statics/{main-d151637af20f70b2e796.js.map → main-64f8273740c4b52c18f5.js.map} +1 -1
- dstack/_internal/server/statics/{main-d48635d8fe670d53961c.css → main-d58fc0460cb0eae7cb5c.css} +1 -1
- dstack/_internal/server/testing/common.py +48 -8
- dstack/_internal/server/utils/routers.py +31 -8
- dstack/_internal/utils/json_utils.py +54 -0
- dstack/api/_public/runs.py +13 -2
- dstack/api/server/_runs.py +12 -2
- dstack/version.py +1 -1
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/METADATA +17 -14
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/RECORD +86 -83
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/WHEEL +0 -0
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -7,9 +7,9 @@ from pydantic import parse_obj_as
|
|
|
7
7
|
|
|
8
8
|
from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
|
|
9
9
|
from dstack._internal.core.errors import GatewayError
|
|
10
|
-
from dstack._internal.core.models.configurations import RateLimit
|
|
10
|
+
from dstack._internal.core.models.configurations import RateLimit, ServiceConfiguration
|
|
11
11
|
from dstack._internal.core.models.instances import SSHConnectionParams
|
|
12
|
-
from dstack._internal.core.models.runs import JobSubmission, Run
|
|
12
|
+
from dstack._internal.core.models.runs import JobSpec, JobSubmission, Run, get_service_port
|
|
13
13
|
from dstack._internal.proxy.gateway.schemas.stats import ServiceStats
|
|
14
14
|
from dstack._internal.server import settings
|
|
15
15
|
|
|
@@ -80,13 +80,15 @@ class GatewayClient:
|
|
|
80
80
|
async def register_replica(
|
|
81
81
|
self,
|
|
82
82
|
run: Run,
|
|
83
|
+
job_spec: JobSpec,
|
|
83
84
|
job_submission: JobSubmission,
|
|
84
85
|
ssh_head_proxy: Optional[SSHConnectionParams],
|
|
85
86
|
ssh_head_proxy_private_key: Optional[str],
|
|
86
87
|
):
|
|
88
|
+
assert isinstance(run.run_spec.configuration, ServiceConfiguration)
|
|
87
89
|
payload = {
|
|
88
90
|
"job_id": job_submission.id.hex,
|
|
89
|
-
"app_port": run.run_spec.configuration
|
|
91
|
+
"app_port": get_service_port(job_spec, run.run_spec.configuration),
|
|
90
92
|
"ssh_head_proxy": ssh_head_proxy.dict() if ssh_head_proxy is not None else None,
|
|
91
93
|
"ssh_head_proxy_private_key": ssh_head_proxy_private_key,
|
|
92
94
|
}
|
|
@@ -106,6 +106,14 @@ def get_instance_requirements(instance_model: InstanceModel) -> Requirements:
|
|
|
106
106
|
return Requirements.__response__.parse_raw(instance_model.requirements)
|
|
107
107
|
|
|
108
108
|
|
|
109
|
+
def get_instance_remote_connection_info(
|
|
110
|
+
instance_model: InstanceModel,
|
|
111
|
+
) -> Optional[RemoteConnectionInfo]:
|
|
112
|
+
if instance_model.remote_connection_info is None:
|
|
113
|
+
return None
|
|
114
|
+
return RemoteConnectionInfo.__response__.parse_raw(instance_model.remote_connection_info)
|
|
115
|
+
|
|
116
|
+
|
|
109
117
|
def get_instance_ssh_private_keys(instance_model: InstanceModel) -> tuple[str, Optional[str]]:
|
|
110
118
|
"""
|
|
111
119
|
Returns a pair of SSH private keys: host key and optional proxy jump key.
|
|
@@ -134,6 +134,8 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
|
|
|
134
134
|
finished_at = None
|
|
135
135
|
if job_model.status.is_finished():
|
|
136
136
|
finished_at = last_processed_at
|
|
137
|
+
status_message = _get_job_status_message(job_model)
|
|
138
|
+
error = _get_job_error(job_model)
|
|
137
139
|
return JobSubmission(
|
|
138
140
|
id=job_model.id,
|
|
139
141
|
submission_num=job_model.submission_num,
|
|
@@ -143,11 +145,13 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
|
|
|
143
145
|
finished_at=finished_at,
|
|
144
146
|
inactivity_secs=job_model.inactivity_secs,
|
|
145
147
|
status=job_model.status,
|
|
148
|
+
status_message=status_message,
|
|
146
149
|
termination_reason=job_model.termination_reason,
|
|
147
150
|
termination_reason_message=job_model.termination_reason_message,
|
|
148
151
|
exit_status=job_model.exit_status,
|
|
149
152
|
job_provisioning_data=job_provisioning_data,
|
|
150
153
|
job_runtime_data=get_job_runtime_data(job_model),
|
|
154
|
+
error=error,
|
|
151
155
|
)
|
|
152
156
|
|
|
153
157
|
|
|
@@ -289,6 +293,19 @@ async def process_terminating_job(
|
|
|
289
293
|
# so that stuck volumes don't prevent the instance from terminating.
|
|
290
294
|
job_model.instance_id = None
|
|
291
295
|
instance_model.last_job_processed_at = common.get_current_datetime()
|
|
296
|
+
|
|
297
|
+
volume_names = (
|
|
298
|
+
jrd.volume_names
|
|
299
|
+
if jrd and jrd.volume_names
|
|
300
|
+
else [va.volume.name for va in instance_model.volume_attachments]
|
|
301
|
+
)
|
|
302
|
+
if volume_names:
|
|
303
|
+
volumes = await list_project_volume_models(
|
|
304
|
+
session=session, project=instance_model.project, names=volume_names
|
|
305
|
+
)
|
|
306
|
+
for volume in volumes:
|
|
307
|
+
volume.last_job_processed_at = common.get_current_datetime()
|
|
308
|
+
|
|
292
309
|
logger.info(
|
|
293
310
|
"%s: instance '%s' has been released, new status is %s",
|
|
294
311
|
fmt(job_model),
|
|
@@ -693,3 +710,31 @@ def _get_job_mount_point_attached_volume(
|
|
|
693
710
|
continue
|
|
694
711
|
return volume
|
|
695
712
|
raise ServerClientError("Failed to find an eligible volume for the mount point")
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
def _get_job_status_message(job_model: JobModel) -> str:
|
|
716
|
+
if job_model.status == JobStatus.DONE:
|
|
717
|
+
return "exited (0)"
|
|
718
|
+
elif job_model.status == JobStatus.FAILED:
|
|
719
|
+
if job_model.termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
|
|
720
|
+
return f"exited ({job_model.exit_status})"
|
|
721
|
+
elif (
|
|
722
|
+
job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
|
|
723
|
+
):
|
|
724
|
+
return "no offers"
|
|
725
|
+
elif job_model.termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
|
|
726
|
+
return "interrupted"
|
|
727
|
+
else:
|
|
728
|
+
return "error"
|
|
729
|
+
elif job_model.status == JobStatus.TERMINATED:
|
|
730
|
+
if job_model.termination_reason == JobTerminationReason.TERMINATED_BY_USER:
|
|
731
|
+
return "stopped"
|
|
732
|
+
elif job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER:
|
|
733
|
+
return "aborted"
|
|
734
|
+
return job_model.status.value
|
|
735
|
+
|
|
736
|
+
|
|
737
|
+
def _get_job_error(job_model: JobModel) -> Optional[str]:
|
|
738
|
+
if job_model.termination_reason is None:
|
|
739
|
+
return None
|
|
740
|
+
return job_model.termination_reason.to_error()
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import shlex
|
|
2
2
|
import sys
|
|
3
|
+
import threading
|
|
3
4
|
from abc import ABC, abstractmethod
|
|
4
5
|
from pathlib import PurePosixPath
|
|
5
6
|
from typing import Dict, List, Optional, Union
|
|
@@ -14,6 +15,7 @@ from dstack._internal.core.models.configurations import (
|
|
|
14
15
|
PortMapping,
|
|
15
16
|
PythonVersion,
|
|
16
17
|
RunConfigurationType,
|
|
18
|
+
ServiceConfiguration,
|
|
17
19
|
)
|
|
18
20
|
from dstack._internal.core.models.profiles import (
|
|
19
21
|
DEFAULT_STOP_DURATION,
|
|
@@ -152,6 +154,7 @@ class JobConfigurator(ABC):
|
|
|
152
154
|
repo_data=self.run_spec.repo_data,
|
|
153
155
|
repo_code_hash=self.run_spec.repo_code_hash,
|
|
154
156
|
file_archives=self.run_spec.file_archives,
|
|
157
|
+
service_port=self._service_port(),
|
|
155
158
|
)
|
|
156
159
|
return job_spec
|
|
157
160
|
|
|
@@ -305,6 +308,11 @@ class JobConfigurator(ABC):
|
|
|
305
308
|
)
|
|
306
309
|
return self._job_ssh_key
|
|
307
310
|
|
|
311
|
+
def _service_port(self) -> Optional[int]:
|
|
312
|
+
if isinstance(self.run_spec.configuration, ServiceConfiguration):
|
|
313
|
+
return self.run_spec.configuration.port.container_port
|
|
314
|
+
return None
|
|
315
|
+
|
|
308
316
|
|
|
309
317
|
def interpolate_job_volumes(
|
|
310
318
|
run_volumes: List[Union[MountPoint, str]],
|
|
@@ -354,7 +362,10 @@ def _join_shell_commands(commands: List[str]) -> str:
|
|
|
354
362
|
return " && ".join(commands)
|
|
355
363
|
|
|
356
364
|
|
|
357
|
-
@cached(
|
|
365
|
+
@cached(
|
|
366
|
+
cache=TTLCache(maxsize=2048, ttl=80),
|
|
367
|
+
lock=threading.Lock(),
|
|
368
|
+
)
|
|
358
369
|
def _get_image_config(image: str, registry_auth: Optional[RegistryAuth]) -> ImageConfig:
|
|
359
370
|
try:
|
|
360
371
|
return get_image_config(image, registry_auth).config
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import collections.abc
|
|
2
3
|
import hashlib
|
|
4
|
+
from abc import abstractmethod
|
|
3
5
|
from asyncio import Lock
|
|
4
6
|
from contextlib import asynccontextmanager
|
|
5
|
-
from typing import AsyncGenerator,
|
|
7
|
+
from typing import AsyncGenerator, Iterable, Iterator, Protocol, TypeVar, Union
|
|
6
8
|
|
|
7
9
|
from sqlalchemy import func, select
|
|
8
10
|
from sqlalchemy.ext.asyncio import AsyncConnection, AsyncSession
|
|
@@ -10,23 +12,54 @@ from sqlalchemy.ext.asyncio import AsyncConnection, AsyncSession
|
|
|
10
12
|
KeyT = TypeVar("KeyT")
|
|
11
13
|
|
|
12
14
|
|
|
13
|
-
class
|
|
14
|
-
def
|
|
15
|
-
|
|
15
|
+
class LocksetLock(Protocol):
|
|
16
|
+
async def acquire(self) -> bool: ...
|
|
17
|
+
def release(self) -> None: ...
|
|
18
|
+
async def __aenter__(self): ...
|
|
19
|
+
async def __aexit__(self, exc_type, exc, tb): ...
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
T = TypeVar("T")
|
|
23
|
+
|
|
16
24
|
|
|
17
|
-
|
|
25
|
+
class Lockset(Protocol[T]):
|
|
26
|
+
def __contains__(self, item: T) -> bool: ...
|
|
27
|
+
def __iter__(self) -> Iterator[T]: ...
|
|
28
|
+
def __len__(self) -> int: ...
|
|
29
|
+
def add(self, item: T) -> None: ...
|
|
30
|
+
def discard(self, item: T) -> None: ...
|
|
31
|
+
def update(self, other: Iterable[T]) -> None: ...
|
|
32
|
+
def difference_update(self, other: Iterable[T]) -> None: ...
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ResourceLocker:
|
|
36
|
+
@abstractmethod
|
|
37
|
+
def get_lockset(self, namespace: str) -> tuple[LocksetLock, Lockset]:
|
|
18
38
|
"""
|
|
19
39
|
Returns a lockset containing locked resources for in-memory locking.
|
|
20
40
|
Also returns a lock that guards the lockset.
|
|
21
41
|
"""
|
|
22
|
-
|
|
42
|
+
pass
|
|
23
43
|
|
|
44
|
+
@abstractmethod
|
|
24
45
|
@asynccontextmanager
|
|
25
|
-
async def lock_ctx(self, namespace: str, keys:
|
|
46
|
+
async def lock_ctx(self, namespace: str, keys: list[KeyT]):
|
|
26
47
|
"""
|
|
27
48
|
Acquires locks for all keys in namespace.
|
|
28
49
|
The keys must be sorted to prevent deadlock.
|
|
29
50
|
"""
|
|
51
|
+
yield
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class InMemoryResourceLocker(ResourceLocker):
|
|
55
|
+
def __init__(self):
|
|
56
|
+
self.namespace_to_locks_map: dict[str, tuple[Lock, set]] = {}
|
|
57
|
+
|
|
58
|
+
def get_lockset(self, namespace: str) -> tuple[Lock, set]:
|
|
59
|
+
return self.namespace_to_locks_map.setdefault(namespace, (Lock(), set()))
|
|
60
|
+
|
|
61
|
+
@asynccontextmanager
|
|
62
|
+
async def lock_ctx(self, namespace: str, keys: list[KeyT]):
|
|
30
63
|
lock, lockset = self.get_lockset(namespace)
|
|
31
64
|
try:
|
|
32
65
|
await _wait_to_lock_many(lock, lockset, keys)
|
|
@@ -35,6 +68,56 @@ class ResourceLocker:
|
|
|
35
68
|
lockset.difference_update(keys)
|
|
36
69
|
|
|
37
70
|
|
|
71
|
+
class DummyAsyncLock:
|
|
72
|
+
async def __aenter__(self):
|
|
73
|
+
pass
|
|
74
|
+
|
|
75
|
+
async def __aexit__(self, exc_type, exc, tb):
|
|
76
|
+
pass
|
|
77
|
+
|
|
78
|
+
async def acquire(self):
|
|
79
|
+
return True
|
|
80
|
+
|
|
81
|
+
def release(self):
|
|
82
|
+
pass
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class DummySet(collections.abc.MutableSet):
|
|
86
|
+
def __contains__(self, item):
|
|
87
|
+
return False
|
|
88
|
+
|
|
89
|
+
def __iter__(self):
|
|
90
|
+
return iter(())
|
|
91
|
+
|
|
92
|
+
def __len__(self):
|
|
93
|
+
return 0
|
|
94
|
+
|
|
95
|
+
def add(self, value):
|
|
96
|
+
pass
|
|
97
|
+
|
|
98
|
+
def discard(self, value):
|
|
99
|
+
pass
|
|
100
|
+
|
|
101
|
+
def update(self, other):
|
|
102
|
+
pass
|
|
103
|
+
|
|
104
|
+
def difference_update(self, other):
|
|
105
|
+
pass
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class DummyResourceLocker(ResourceLocker):
|
|
109
|
+
def __init__(self):
|
|
110
|
+
self.lock = DummyAsyncLock()
|
|
111
|
+
self.lockset = DummySet()
|
|
112
|
+
|
|
113
|
+
def get_lockset(self, namespace: str) -> tuple[DummyAsyncLock, DummySet]:
|
|
114
|
+
return self.lock, self.lockset
|
|
115
|
+
|
|
116
|
+
@asynccontextmanager
|
|
117
|
+
async def lock_ctx(self, namespace: str, keys: list[KeyT]):
|
|
118
|
+
yield
|
|
119
|
+
|
|
120
|
+
|
|
38
121
|
def string_to_lock_id(s: str) -> int:
|
|
39
122
|
return int(hashlib.sha256(s.encode()).hexdigest(), 16) % (2**63)
|
|
40
123
|
|
|
@@ -67,15 +150,21 @@ async def try_advisory_lock_ctx(
|
|
|
67
150
|
await bind.execute(select(func.pg_advisory_unlock(string_to_lock_id(resource))))
|
|
68
151
|
|
|
69
152
|
|
|
70
|
-
|
|
153
|
+
_in_memory_locker = InMemoryResourceLocker()
|
|
154
|
+
_dummy_locker = DummyResourceLocker()
|
|
71
155
|
|
|
72
156
|
|
|
73
|
-
def get_locker() -> ResourceLocker:
|
|
74
|
-
|
|
157
|
+
def get_locker(dialect_name: str) -> ResourceLocker:
|
|
158
|
+
if dialect_name == "sqlite":
|
|
159
|
+
return _in_memory_locker
|
|
160
|
+
# We could use an in-memory locker on Postgres
|
|
161
|
+
# but it can lead to unnecessary lock contention,
|
|
162
|
+
# so we use a dummy locker that does not take any locks.
|
|
163
|
+
return _dummy_locker
|
|
75
164
|
|
|
76
165
|
|
|
77
166
|
async def _wait_to_lock_many(
|
|
78
|
-
lock: asyncio.Lock, locked:
|
|
167
|
+
lock: asyncio.Lock, locked: set[KeyT], keys: list[KeyT], *, delay: float = 0.1
|
|
79
168
|
):
|
|
80
169
|
"""
|
|
81
170
|
Retry locking until all the keys are locked.
|
|
@@ -83,14 +172,16 @@ async def _wait_to_lock_many(
|
|
|
83
172
|
The keys must be sorted to prevent deadlock.
|
|
84
173
|
"""
|
|
85
174
|
left_to_lock = keys.copy()
|
|
86
|
-
while
|
|
175
|
+
while True:
|
|
87
176
|
async with lock:
|
|
88
177
|
locked_now_num = 0
|
|
89
178
|
for key in left_to_lock:
|
|
90
179
|
if key in locked:
|
|
91
|
-
# Someone already
|
|
180
|
+
# Someone already acquired the lock, wait
|
|
92
181
|
break
|
|
93
182
|
locked.add(key)
|
|
94
183
|
locked_now_num += 1
|
|
95
184
|
left_to_lock = left_to_lock[locked_now_num:]
|
|
185
|
+
if not left_to_lock:
|
|
186
|
+
return
|
|
96
187
|
await asyncio.sleep(delay)
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
from typing import Union
|
|
2
2
|
|
|
3
|
-
from dstack._internal.server.models import JobModel, RunModel
|
|
3
|
+
from dstack._internal.server.models import GatewayModel, JobModel, RunModel
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
def fmt(model: Union[RunModel, JobModel]) -> str:
|
|
6
|
+
def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
|
|
7
7
|
"""Consistent string representation of a model for logging."""
|
|
8
8
|
if isinstance(model, RunModel):
|
|
9
9
|
return f"run({model.id.hex[:6]}){model.run_name}"
|
|
10
10
|
if isinstance(model, JobModel):
|
|
11
11
|
return f"job({model.id.hex[:6]}){model.job_name}"
|
|
12
|
+
if isinstance(model, GatewayModel):
|
|
13
|
+
return f"gateway({model.id.hex[:6]}){model.name}"
|
|
12
14
|
return str(model)
|
|
@@ -8,7 +8,11 @@ from dstack._internal.server.models import ProjectModel
|
|
|
8
8
|
from dstack._internal.server.schemas.logs import PollLogsRequest
|
|
9
9
|
from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
|
|
10
10
|
from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
|
|
11
|
-
from dstack._internal.server.services.logs.base import
|
|
11
|
+
from dstack._internal.server.services.logs.base import (
|
|
12
|
+
LogStorage,
|
|
13
|
+
LogStorageError,
|
|
14
|
+
b64encode_raw_message,
|
|
15
|
+
)
|
|
12
16
|
from dstack._internal.server.services.logs.filelog import FileLogStorage
|
|
13
17
|
from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
|
|
14
18
|
from dstack._internal.utils.common import run_async
|
|
@@ -75,4 +79,13 @@ def write_logs(
|
|
|
75
79
|
|
|
76
80
|
|
|
77
81
|
async def poll_logs_async(project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
|
|
78
|
-
|
|
82
|
+
job_submission_logs = await run_async(
|
|
83
|
+
get_log_storage().poll_logs, project=project, request=request
|
|
84
|
+
)
|
|
85
|
+
# Logs are stored in plaintext but transmitted in base64 for API/CLI backward compatibility.
|
|
86
|
+
# Old logs stored in base64 are encoded twice for transmission and shown as base64 in CLI/UI.
|
|
87
|
+
# We live with that.
|
|
88
|
+
# TODO: Drop base64 encoding in 0.20.
|
|
89
|
+
for log_event in job_submission_logs.logs:
|
|
90
|
+
log_event.message = b64encode_raw_message(log_event.message.encode())
|
|
91
|
+
return job_submission_logs
|
|
@@ -17,7 +17,6 @@ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
|
|
|
17
17
|
from dstack._internal.server.services.logs.base import (
|
|
18
18
|
LogStorage,
|
|
19
19
|
LogStorageError,
|
|
20
|
-
b64encode_raw_message,
|
|
21
20
|
datetime_to_unix_time_ms,
|
|
22
21
|
unix_time_ms_to_datetime,
|
|
23
22
|
)
|
|
@@ -238,8 +237,7 @@ class CloudWatchLogStorage(LogStorage):
|
|
|
238
237
|
skipped_future_events += 1
|
|
239
238
|
continue
|
|
240
239
|
cw_event = self._runner_log_event_to_cloudwatch_event(event)
|
|
241
|
-
|
|
242
|
-
message_size = len(cw_event["message"]) + self.MESSAGE_OVERHEAD_SIZE
|
|
240
|
+
message_size = len(event.message) + self.MESSAGE_OVERHEAD_SIZE
|
|
243
241
|
if message_size > self.MESSAGE_MAX_SIZE:
|
|
244
242
|
# we should never hit this limit, as we use `io.Copy` to copy from pty to logs,
|
|
245
243
|
# which under the hood uses 32KiB buffer, see runner/internal/executor/executor.go,
|
|
@@ -271,7 +269,7 @@ class CloudWatchLogStorage(LogStorage):
|
|
|
271
269
|
) -> _CloudWatchLogEvent:
|
|
272
270
|
return {
|
|
273
271
|
"timestamp": runner_log_event.timestamp,
|
|
274
|
-
"message":
|
|
272
|
+
"message": runner_log_event.message.decode(errors="replace"),
|
|
275
273
|
}
|
|
276
274
|
|
|
277
275
|
@contextmanager
|
|
@@ -2,6 +2,7 @@ from pathlib import Path
|
|
|
2
2
|
from typing import List, Union
|
|
3
3
|
from uuid import UUID
|
|
4
4
|
|
|
5
|
+
from dstack._internal.core.errors import ServerClientError
|
|
5
6
|
from dstack._internal.core.models.logs import (
|
|
6
7
|
JobSubmissionLogs,
|
|
7
8
|
LogEvent,
|
|
@@ -14,8 +15,6 @@ from dstack._internal.server.schemas.logs import PollLogsRequest
|
|
|
14
15
|
from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
|
|
15
16
|
from dstack._internal.server.services.logs.base import (
|
|
16
17
|
LogStorage,
|
|
17
|
-
LogStorageError,
|
|
18
|
-
b64encode_raw_message,
|
|
19
18
|
unix_time_ms_to_datetime,
|
|
20
19
|
)
|
|
21
20
|
|
|
@@ -30,9 +29,6 @@ class FileLogStorage(LogStorage):
|
|
|
30
29
|
self.root = Path(root)
|
|
31
30
|
|
|
32
31
|
def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
|
|
33
|
-
if request.descending:
|
|
34
|
-
raise LogStorageError("descending: true is not supported")
|
|
35
|
-
|
|
36
32
|
log_producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
|
|
37
33
|
log_file_path = self._get_log_file_path(
|
|
38
34
|
project_name=project.name,
|
|
@@ -46,11 +42,11 @@ class FileLogStorage(LogStorage):
|
|
|
46
42
|
try:
|
|
47
43
|
start_line = int(request.next_token)
|
|
48
44
|
if start_line < 0:
|
|
49
|
-
raise
|
|
45
|
+
raise ServerClientError(
|
|
50
46
|
f"Invalid next_token: {request.next_token}. Must be a non-negative integer."
|
|
51
47
|
)
|
|
52
48
|
except ValueError:
|
|
53
|
-
raise
|
|
49
|
+
raise ServerClientError(
|
|
54
50
|
f"Invalid next_token: {request.next_token}. Must be a valid integer."
|
|
55
51
|
)
|
|
56
52
|
|
|
@@ -60,31 +56,41 @@ class FileLogStorage(LogStorage):
|
|
|
60
56
|
|
|
61
57
|
try:
|
|
62
58
|
with open(log_file_path) as f:
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
59
|
+
# Skip to start_line if needed
|
|
60
|
+
for _ in range(start_line):
|
|
61
|
+
if f.readline() == "":
|
|
62
|
+
# File is shorter than start_line
|
|
63
|
+
return JobSubmissionLogs(logs=logs, next_token=next_token)
|
|
67
64
|
current_line += 1
|
|
68
|
-
continue
|
|
69
65
|
|
|
70
|
-
|
|
71
|
-
|
|
66
|
+
# Read lines one by one
|
|
67
|
+
while True:
|
|
68
|
+
line = f.readline()
|
|
69
|
+
if line == "": # EOF
|
|
70
|
+
break
|
|
71
|
+
|
|
72
|
+
current_line += 1
|
|
72
73
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
74
|
+
try:
|
|
75
|
+
log_event = LogEvent.__response__.parse_raw(line)
|
|
76
|
+
except Exception:
|
|
77
|
+
# Skip malformed lines
|
|
78
|
+
continue
|
|
77
79
|
|
|
78
|
-
|
|
80
|
+
if request.start_time and log_event.timestamp <= request.start_time:
|
|
81
|
+
continue
|
|
82
|
+
if request.end_time is not None and log_event.timestamp >= request.end_time:
|
|
83
|
+
break
|
|
79
84
|
|
|
80
|
-
|
|
81
|
-
# Only set next_token if there are more lines to read
|
|
82
|
-
if current_line < len(lines):
|
|
83
|
-
next_token = str(current_line)
|
|
84
|
-
break
|
|
85
|
+
logs.append(log_event)
|
|
85
86
|
|
|
86
|
-
|
|
87
|
-
|
|
87
|
+
if len(logs) >= request.limit:
|
|
88
|
+
# Check if there are more lines to read
|
|
89
|
+
if f.readline() != "":
|
|
90
|
+
next_token = str(current_line)
|
|
91
|
+
break
|
|
92
|
+
except FileNotFoundError:
|
|
93
|
+
pass
|
|
88
94
|
|
|
89
95
|
return JobSubmissionLogs(logs=logs, next_token=next_token)
|
|
90
96
|
|
|
@@ -140,5 +146,5 @@ class FileLogStorage(LogStorage):
|
|
|
140
146
|
return LogEvent(
|
|
141
147
|
timestamp=unix_time_ms_to_datetime(runner_log_event.timestamp),
|
|
142
148
|
log_source=LogEventSource.STDOUT,
|
|
143
|
-
message=
|
|
149
|
+
message=runner_log_event.message.decode(errors="replace"),
|
|
144
150
|
)
|
|
@@ -14,7 +14,6 @@ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
|
|
|
14
14
|
from dstack._internal.server.services.logs.base import (
|
|
15
15
|
LogStorage,
|
|
16
16
|
LogStorageError,
|
|
17
|
-
b64encode_raw_message,
|
|
18
17
|
unix_time_ms_to_datetime,
|
|
19
18
|
)
|
|
20
19
|
from dstack._internal.utils.common import batched
|
|
@@ -137,15 +136,14 @@ class GCPLogStorage(LogStorage):
|
|
|
137
136
|
with self.logger.batch() as batcher:
|
|
138
137
|
for batch in batched(logs, self.MAX_BATCH_SIZE):
|
|
139
138
|
for log in batch:
|
|
140
|
-
message =
|
|
139
|
+
message = log.message.decode(errors="replace")
|
|
141
140
|
timestamp = unix_time_ms_to_datetime(log.timestamp)
|
|
142
|
-
|
|
143
|
-
if len(message) > self.MAX_RUNNER_MESSAGE_SIZE:
|
|
141
|
+
if len(log.message) > self.MAX_RUNNER_MESSAGE_SIZE:
|
|
144
142
|
logger.error(
|
|
145
143
|
"Stream %s: skipping event at %s, message exceeds max size: %d > %d",
|
|
146
144
|
stream_name,
|
|
147
145
|
timestamp.isoformat(),
|
|
148
|
-
len(message),
|
|
146
|
+
len(log.message),
|
|
149
147
|
self.MAX_RUNNER_MESSAGE_SIZE,
|
|
150
148
|
)
|
|
151
149
|
continue
|
|
@@ -12,10 +12,12 @@ from dstack._internal.core.models.configurations import ServiceConfiguration
|
|
|
12
12
|
from dstack._internal.core.models.instances import RemoteConnectionInfo, SSHConnectionParams
|
|
13
13
|
from dstack._internal.core.models.runs import (
|
|
14
14
|
JobProvisioningData,
|
|
15
|
+
JobSpec,
|
|
15
16
|
JobStatus,
|
|
16
17
|
RunSpec,
|
|
17
18
|
RunStatus,
|
|
18
19
|
ServiceSpec,
|
|
20
|
+
get_service_port,
|
|
19
21
|
)
|
|
20
22
|
from dstack._internal.core.models.services import AnyModel
|
|
21
23
|
from dstack._internal.proxy.lib.models import (
|
|
@@ -97,9 +99,10 @@ class ServerProxyRepo(BaseProxyRepo):
|
|
|
97
99
|
if rci.ssh_proxy is not None:
|
|
98
100
|
ssh_head_proxy = rci.ssh_proxy
|
|
99
101
|
ssh_head_proxy_private_key = get_or_error(rci.ssh_proxy_keys)[0].private
|
|
102
|
+
job_spec: JobSpec = JobSpec.__response__.parse_raw(job.job_spec_data)
|
|
100
103
|
replica = Replica(
|
|
101
104
|
id=job.id.hex,
|
|
102
|
-
app_port=run_spec.configuration
|
|
105
|
+
app_port=get_service_port(job_spec, run_spec.configuration),
|
|
103
106
|
ssh_destination=ssh_destination,
|
|
104
107
|
ssh_port=ssh_port,
|
|
105
108
|
ssh_proxy=ssh_proxy,
|