dstack 0.19.18__py3-none-any.whl → 0.19.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/services/configurators/fleet.py +99 -1
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/core/backends/cloudrift/api_client.py +13 -1
- dstack/_internal/core/backends/oci/resources.py +5 -5
- dstack/_internal/core/compatibility/runs.py +12 -1
- dstack/_internal/core/compatibility/volumes.py +2 -0
- dstack/_internal/core/models/common.py +38 -2
- dstack/_internal/core/models/configurations.py +9 -1
- dstack/_internal/core/models/fleets.py +2 -1
- dstack/_internal/core/models/profiles.py +8 -5
- dstack/_internal/core/models/resources.py +15 -8
- dstack/_internal/core/models/runs.py +41 -138
- dstack/_internal/core/models/volumes.py +14 -0
- dstack/_internal/core/services/diff.py +30 -10
- dstack/_internal/core/services/ssh/attach.py +2 -0
- dstack/_internal/server/app.py +17 -9
- dstack/_internal/server/background/__init__.py +5 -3
- dstack/_internal/server/background/tasks/process_gateways.py +46 -28
- dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +2 -0
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
- dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/backends.py +23 -16
- dstack/_internal/server/routers/files.py +7 -6
- dstack/_internal/server/routers/fleets.py +47 -36
- dstack/_internal/server/routers/gateways.py +27 -18
- dstack/_internal/server/routers/instances.py +18 -13
- dstack/_internal/server/routers/logs.py +7 -3
- dstack/_internal/server/routers/metrics.py +14 -8
- dstack/_internal/server/routers/projects.py +33 -22
- dstack/_internal/server/routers/repos.py +7 -6
- dstack/_internal/server/routers/runs.py +49 -28
- dstack/_internal/server/routers/secrets.py +20 -15
- dstack/_internal/server/routers/server.py +7 -4
- dstack/_internal/server/routers/users.py +22 -19
- dstack/_internal/server/routers/volumes.py +34 -25
- dstack/_internal/server/schemas/logs.py +3 -11
- dstack/_internal/server/schemas/runs.py +17 -5
- dstack/_internal/server/services/fleets.py +354 -72
- dstack/_internal/server/services/gateways/__init__.py +13 -4
- dstack/_internal/server/services/gateways/client.py +5 -3
- dstack/_internal/server/services/instances.py +8 -0
- dstack/_internal/server/services/jobs/__init__.py +45 -0
- dstack/_internal/server/services/jobs/configurators/base.py +7 -0
- dstack/_internal/server/services/locking.py +3 -1
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/__init__.py +15 -2
- dstack/_internal/server/services/logs/aws.py +47 -7
- dstack/_internal/server/services/logs/filelog.py +148 -32
- dstack/_internal/server/services/logs/gcp.py +3 -5
- dstack/_internal/server/services/prometheus/custom_metrics.py +20 -0
- dstack/_internal/server/services/proxy/repo.py +4 -1
- dstack/_internal/server/services/runs.py +115 -32
- dstack/_internal/server/services/services/__init__.py +2 -1
- dstack/_internal/server/services/users.py +3 -1
- dstack/_internal/server/services/volumes.py +13 -0
- dstack/_internal/server/settings.py +7 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js → main-39a767528976f8078166.js} +11 -30
- dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js.map → main-39a767528976f8078166.js.map} +1 -1
- dstack/_internal/server/statics/{main-d58fc0460cb0eae7cb5c.css → main-8f9ee218d3eb45989682.css} +2 -2
- dstack/_internal/server/testing/common.py +41 -5
- dstack/_internal/server/utils/routers.py +31 -8
- dstack/_internal/utils/common.py +10 -21
- dstack/_internal/utils/json_utils.py +54 -0
- dstack/api/_public/runs.py +13 -2
- dstack/api/server/_runs.py +12 -2
- dstack/version.py +1 -1
- {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/METADATA +7 -5
- {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/RECORD +74 -71
- {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/WHEEL +0 -0
- {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/services/gateways/client.py

@@ -7,9 +7,9 @@ from pydantic import parse_obj_as
 
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
 from dstack._internal.core.errors import GatewayError
-from dstack._internal.core.models.configurations import RateLimit
+from dstack._internal.core.models.configurations import RateLimit, ServiceConfiguration
 from dstack._internal.core.models.instances import SSHConnectionParams
-from dstack._internal.core.models.runs import JobSubmission, Run
+from dstack._internal.core.models.runs import JobSpec, JobSubmission, Run, get_service_port
 from dstack._internal.proxy.gateway.schemas.stats import ServiceStats
 from dstack._internal.server import settings

@@ -80,13 +80,15 @@ class GatewayClient:
     async def register_replica(
         self,
         run: Run,
+        job_spec: JobSpec,
         job_submission: JobSubmission,
         ssh_head_proxy: Optional[SSHConnectionParams],
         ssh_head_proxy_private_key: Optional[str],
     ):
+        assert isinstance(run.run_spec.configuration, ServiceConfiguration)
         payload = {
             "job_id": job_submission.id.hex,
-            "app_port": run.run_spec.configuration.port.container_port,
+            "app_port": get_service_port(job_spec, run.run_spec.configuration),
             "ssh_head_proxy": ssh_head_proxy.dict() if ssh_head_proxy is not None else None,
             "ssh_head_proxy_private_key": ssh_head_proxy_private_key,
         }
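Note: get_service_port is new in dstack._internal.core.models.runs, and its body is not part of this diff. Judging from the JobConfigurator._service_port addition further below (job specs now record the service's container port at submission time), a plausible sketch of its behavior is the following; this is an assumption, not the actual implementation:

    # Hypothetical sketch of get_service_port, not the code from runs.py:
    def get_service_port(job_spec, configuration) -> int:
        # Prefer the port captured in the job spec at submission time;
        # fall back to the configuration for specs created by older
        # servers that predate JobSpec.service_port.
        if getattr(job_spec, "service_port", None) is not None:
            return job_spec.service_port
        return configuration.port.container_port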
dstack/_internal/server/services/instances.py

@@ -106,6 +106,14 @@ def get_instance_requirements(instance_model: InstanceModel) -> Requirements:
     return Requirements.__response__.parse_raw(instance_model.requirements)
 
 
+def get_instance_remote_connection_info(
+    instance_model: InstanceModel,
+) -> Optional[RemoteConnectionInfo]:
+    if instance_model.remote_connection_info is None:
+        return None
+    return RemoteConnectionInfo.__response__.parse_raw(instance_model.remote_connection_info)
+
+
 def get_instance_ssh_private_keys(instance_model: InstanceModel) -> tuple[str, Optional[str]]:
     """
     Returns a pair of SSH private keys: host key and optional proxy jump key.
dstack/_internal/server/services/jobs/__init__.py

@@ -134,6 +134,8 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
     finished_at = None
     if job_model.status.is_finished():
         finished_at = last_processed_at
+    status_message = _get_job_status_message(job_model)
+    error = _get_job_error(job_model)
     return JobSubmission(
         id=job_model.id,
         submission_num=job_model.submission_num,

@@ -143,11 +145,13 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
         finished_at=finished_at,
         inactivity_secs=job_model.inactivity_secs,
         status=job_model.status,
+        status_message=status_message,
         termination_reason=job_model.termination_reason,
         termination_reason_message=job_model.termination_reason_message,
         exit_status=job_model.exit_status,
         job_provisioning_data=job_provisioning_data,
         job_runtime_data=get_job_runtime_data(job_model),
+        error=error,
     )

@@ -289,6 +293,19 @@ async def process_terminating_job(
         # so that stuck volumes don't prevent the instance from terminating.
         job_model.instance_id = None
         instance_model.last_job_processed_at = common.get_current_datetime()
+
+        volume_names = (
+            jrd.volume_names
+            if jrd and jrd.volume_names
+            else [va.volume.name for va in instance_model.volume_attachments]
+        )
+        if volume_names:
+            volumes = await list_project_volume_models(
+                session=session, project=instance_model.project, names=volume_names
+            )
+            for volume in volumes:
+                volume.last_job_processed_at = common.get_current_datetime()
+
         logger.info(
             "%s: instance '%s' has been released, new status is %s",
             fmt(job_model),

@@ -693,3 +710,31 @@ def _get_job_mount_point_attached_volume(
             continue
         return volume
     raise ServerClientError("Failed to find an eligible volume for the mount point")
+
+
+def _get_job_status_message(job_model: JobModel) -> str:
+    if job_model.status == JobStatus.DONE:
+        return "exited (0)"
+    elif job_model.status == JobStatus.FAILED:
+        if job_model.termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
+            return f"exited ({job_model.exit_status})"
+        elif (
+            job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
+        ):
+            return "no offers"
+        elif job_model.termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
+            return "interrupted"
+        else:
+            return "error"
+    elif job_model.status == JobStatus.TERMINATED:
+        if job_model.termination_reason == JobTerminationReason.TERMINATED_BY_USER:
+            return "stopped"
+        elif job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER:
+            return "aborted"
+    return job_model.status.value
+
+
+def _get_job_error(job_model: JobModel) -> Optional[str]:
+    if job_model.termination_reason is None:
+        return None
+    return job_model.termination_reason.to_error()
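For reference, the messages the new helper produces for representative job states, read directly from the code above:

    DONE                                            -> "exited (0)"
    FAILED + CONTAINER_EXITED_WITH_ERROR (exit 137) -> "exited (137)"
    FAILED + FAILED_TO_START_DUE_TO_NO_CAPACITY     -> "no offers"
    FAILED + INTERRUPTED_BY_NO_CAPACITY             -> "interrupted"
    FAILED + any other reason                       -> "error"
    TERMINATED + TERMINATED_BY_USER                 -> "stopped"
    TERMINATED + ABORTED_BY_USER                    -> "aborted"
    any other status, e.g. RUNNING                  -> the raw status value ("running")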
dstack/_internal/server/services/jobs/configurators/base.py

@@ -15,6 +15,7 @@ from dstack._internal.core.models.configurations import (
     PortMapping,
     PythonVersion,
     RunConfigurationType,
+    ServiceConfiguration,
 )
 from dstack._internal.core.models.profiles import (
     DEFAULT_STOP_DURATION,

@@ -153,6 +154,7 @@ class JobConfigurator(ABC):
             repo_data=self.run_spec.repo_data,
             repo_code_hash=self.run_spec.repo_code_hash,
             file_archives=self.run_spec.file_archives,
+            service_port=self._service_port(),
         )
         return job_spec

@@ -306,6 +308,11 @@ class JobConfigurator(ABC):
         )
         return self._job_ssh_key
 
+    def _service_port(self) -> Optional[int]:
+        if isinstance(self.run_spec.configuration, ServiceConfiguration):
+            return self.run_spec.configuration.port.container_port
+        return None
+
 
 def interpolate_job_volumes(
     run_volumes: List[Union[MountPoint, str]],
dstack/_internal/server/services/locking.py

@@ -172,7 +172,7 @@ async def _wait_to_lock_many(
     The keys must be sorted to prevent deadlock.
     """
     left_to_lock = keys.copy()
-    while left_to_lock:
+    while True:
         async with lock:
             locked_now_num = 0
             for key in left_to_lock:

@@ -182,4 +182,6 @@ async def _wait_to_lock_many(
                 locked.add(key)
                 locked_now_num += 1
             left_to_lock = left_to_lock[locked_now_num:]
+            if not left_to_lock:
+                return
         await asyncio.sleep(delay)
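The effect of this change is that _wait_to_lock_many now returns as soon as the last key is locked, instead of sleeping one extra delay before the old while condition was re-checked. A self-contained sketch of the corrected pattern (names and the contended-key check are illustrative, not dstack's exact code):

    import asyncio
    from typing import Hashable, List, Set

    async def wait_to_lock_many(
        lock: asyncio.Lock, locked: Set[Hashable], keys: List[Hashable], delay: float = 0.1
    ) -> None:
        # Keys must be locked in sorted order by all callers to avoid deadlock.
        left_to_lock = sorted(keys)
        while True:
            async with lock:
                locked_now_num = 0
                for key in left_to_lock:
                    if key in locked:
                        break  # contended; retry the remainder after a delay
                    locked.add(key)
                    locked_now_num += 1
                left_to_lock = left_to_lock[locked_now_num:]
                if not left_to_lock:
                    return  # all keys locked; exit without an extra sleep
            await asyncio.sleep(delay)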
dstack/_internal/server/services/logging.py

@@ -1,12 +1,14 @@
 from typing import Union
 
-from dstack._internal.server.models import JobModel, RunModel
+from dstack._internal.server.models import GatewayModel, JobModel, RunModel
 
 
-def fmt(model: Union[RunModel, JobModel]) -> str:
+def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
     """Consistent string representation of a model for logging."""
     if isinstance(model, RunModel):
         return f"run({model.id.hex[:6]}){model.run_name}"
     if isinstance(model, JobModel):
         return f"job({model.id.hex[:6]}){model.job_name}"
+    if isinstance(model, GatewayModel):
+        return f"gateway({model.id.hex[:6]}){model.name}"
     return str(model)
dstack/_internal/server/services/logs/__init__.py

@@ -8,7 +8,11 @@ from dstack._internal.server.models import ProjectModel
 from dstack._internal.server.schemas.logs import PollLogsRequest
 from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
-from dstack._internal.server.services.logs.base import LogStorage, LogStorageError, b64encode_raw_message
+from dstack._internal.server.services.logs.base import (
+    LogStorage,
+    LogStorageError,
+    b64encode_raw_message,
+)
 from dstack._internal.server.services.logs.filelog import FileLogStorage
 from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
 from dstack._internal.utils.common import run_async

@@ -75,4 +79,13 @@ def write_logs(
 
 
 async def poll_logs_async(project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
-    return await run_async(get_log_storage().poll_logs, project=project, request=request)
+    job_submission_logs = await run_async(
+        get_log_storage().poll_logs, project=project, request=request
+    )
+    # Logs are stored in plaintext but transmitted in base64 for API/CLI backward compatibility.
+    # Old logs stored in base64 are encoded twice for transmission and shown as base64 in CLI/UI.
+    # We live with that.
+    # TODO: Drop base64 encoding in 0.20.
+    for log_event in job_submission_logs.logs:
+        log_event.message = b64encode_raw_message(log_event.message.encode())
+    return job_submission_logs
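b64encode_raw_message lives in dstack._internal.server.services.logs.base and is not shown in this diff; assuming it simply base64-encodes raw bytes into a str, the compatibility scheme above works out to roughly this round trip (illustrative sketch only):

    import base64

    def b64encode_raw_message(message: bytes) -> str:
        # Assumed shape of the helper: raw bytes in, base64 text out.
        return base64.b64encode(message).decode()

    stored = "Hello from the job\n"                # plaintext, as now stored by all backends
    wire = b64encode_raw_message(stored.encode())  # base64, as returned by the API
    assert base64.b64decode(wire).decode() == stored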
dstack/_internal/server/services/logs/aws.py

@@ -17,7 +17,6 @@ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
     LogStorageError,
-    b64encode_raw_message,
     datetime_to_unix_time_ms,
     unix_time_ms_to_datetime,
 )

@@ -56,6 +55,8 @@ class CloudWatchLogStorage(LogStorage):
     PAST_EVENT_MAX_DELTA = int((timedelta(days=14)).total_seconds()) * 1000 - CLOCK_DRIFT
     # "None of the log events in the batch can be more than 2 hours in the future."
     FUTURE_EVENT_MAX_DELTA = int((timedelta(hours=2)).total_seconds()) * 1000 - CLOCK_DRIFT
+    # Maximum number of retries when polling for log events to skip empty pages.
+    MAX_RETRIES = 10
 
     def __init__(self, *, group: str, region: Optional[str] = None) -> None:
         with self._wrap_boto_errors():

@@ -81,7 +82,7 @@ class CloudWatchLogStorage(LogStorage):
         next_token: Optional[str] = None
         with self._wrap_boto_errors():
             try:
-                cw_events, next_token = self._get_log_events(stream, request)
+                cw_events, next_token = self._get_log_events_with_retry(stream, request)
             except botocore.exceptions.ClientError as e:
                 if not self._is_resource_not_found_exception(e):
                     raise

@@ -102,7 +103,47 @@ class CloudWatchLogStorage(LogStorage):
             )
             for cw_event in cw_events
         ]
-        return JobSubmissionLogs(logs=logs, next_token=next_token)
+        return JobSubmissionLogs(logs=logs, next_token=next_token)
+
+    def _get_log_events_with_retry(
+        self, stream: str, request: PollLogsRequest
+    ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
+        current_request = request
+        previous_next_token = request.next_token
+
+        for attempt in range(self.MAX_RETRIES):
+            cw_events, next_token = self._get_log_events(stream, current_request)
+
+            if cw_events:
+                return cw_events, next_token
+
+            if not next_token or next_token == previous_next_token:
+                return [], None
+
+            previous_next_token = next_token
+            current_request = PollLogsRequest(
+                run_name=request.run_name,
+                job_submission_id=request.job_submission_id,
+                start_time=request.start_time,
+                end_time=request.end_time,
+                descending=request.descending,
+                next_token=next_token,
+                limit=request.limit,
+                diagnose=request.diagnose,
+            )
+
+        if not request.descending:
+            logger.debug(
+                "Stream %s: exhausted %d retries without finding logs, returning empty response",
+                stream,
+                self.MAX_RETRIES,
+            )
+        # Only return the next token after exhausting retries if going descending --
+        # AWS CloudWatch guarantees more logs in that case. In ascending mode,
+        # next token is always returned, even if no logs remain.
+        # So descending works reliably; ascending has limits if gaps are too large.
+        # In the future, UI/CLI should handle retries, and we can return next token for ascending too.
+        return [], next_token if request.descending else None
 
     def _get_log_events(
         self, stream: str, request: PollLogsRequest

@@ -116,7 +157,7 @@ class CloudWatchLogStorage(LogStorage):
         }
 
         if request.start_time:
-            parameters["startTime"] = datetime_to_unix_time_ms(request.start_time)
+            parameters["startTime"] = datetime_to_unix_time_ms(request.start_time)
 
         if request.end_time:
             parameters["endTime"] = datetime_to_unix_time_ms(request.end_time)

@@ -238,8 +279,7 @@ class CloudWatchLogStorage(LogStorage):
                 skipped_future_events += 1
                 continue
             cw_event = self._runner_log_event_to_cloudwatch_event(event)
-
-            message_size = len(cw_event["message"]) + self.MESSAGE_OVERHEAD_SIZE
+            message_size = len(event.message) + self.MESSAGE_OVERHEAD_SIZE
             if message_size > self.MESSAGE_MAX_SIZE:
                 # we should never hit this limit, as we use `io.Copy` to copy from pty to logs,
                 # which under the hood uses 32KiB buffer, see runner/internal/executor/executor.go,

@@ -271,7 +311,7 @@ class CloudWatchLogStorage(LogStorage):
     ) -> _CloudWatchLogEvent:
         return {
             "timestamp": runner_log_event.timestamp,
-            "message": b64encode_raw_message(runner_log_event.message),
+            "message": runner_log_event.message.decode(errors="replace"),
         }
 
     @contextmanager
dstack/_internal/server/services/logs/filelog.py

@@ -1,7 +1,9 @@
+import os
 from pathlib import Path
-from typing import List, Union
+from typing import Generator, List, Optional, Tuple, Union
 from uuid import UUID
 
+from dstack._internal.core.errors import ServerClientError
 from dstack._internal.core.models.logs import (
     JobSubmissionLogs,
     LogEvent,

@@ -14,8 +16,6 @@ from dstack._internal.server.schemas.logs import PollLogsRequest
 from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
-    LogStorageError,
-    b64encode_raw_message,
     unix_time_ms_to_datetime,
 )
 

@@ -30,9 +30,6 @@ class FileLogStorage(LogStorage):
         self.root = Path(root)
 
     def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
-        if request.descending:
-            raise LogStorageError("descending: true is not supported")
-
         log_producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
         log_file_path = self._get_log_file_path(
             project_name=project.name,

@@ -41,18 +38,17 @@ class FileLogStorage(LogStorage):
             producer=log_producer,
         )
 
+        if request.descending:
+            return self._poll_logs_descending(log_file_path, request)
+        else:
+            return self._poll_logs_ascending(log_file_path, request)
+
+    def _poll_logs_ascending(
+        self, log_file_path: Path, request: PollLogsRequest
+    ) -> JobSubmissionLogs:
         start_line = 0
         if request.next_token:
-            try:
-                start_line = int(request.next_token)
-                if start_line < 0:
-                    raise LogStorageError(
-                        f"Invalid next_token: {request.next_token}. Must be a non-negative integer."
-                    )
-            except ValueError:
-                raise LogStorageError(
-                    f"Invalid next_token: {request.next_token}. Must be a valid integer."
-                )
+            start_line = self._next_token(request)
 
         logs = []
         next_token = None

@@ -60,34 +56,140 @@ class FileLogStorage(LogStorage):
 
         try:
             with open(log_file_path) as f:
-                lines = f.readlines()
+                # Skip to start_line if needed
+                for _ in range(start_line):
+                    if f.readline() == "":
+                        # File is shorter than start_line
+                        return JobSubmissionLogs(logs=logs, next_token=next_token)
+                    current_line += 1
+
+                # Read lines one by one
+                while True:
+                    line = f.readline()
+                    if line == "":  # EOF
+                        break
 
-                for i, line in enumerate(lines):
-                    if current_line < start_line:
                     current_line += 1
-                        continue
 
-                    log_event = LogEvent.__response__.parse_raw(line)
-                    current_line += 1
+                    try:
+                        log_event = LogEvent.__response__.parse_raw(line)
+                    except Exception:
+                        # Skip malformed lines
+                        continue
 
-                    if request.start_time and log_event.timestamp <= request.start_time:
+                    if request.start_time and log_event.timestamp <= request.start_time:
+                        continue
+                    if request.end_time is not None and log_event.timestamp >= request.end_time:
+                        break
+
+                    logs.append(log_event)
+
+                    if len(logs) >= request.limit:
+                        # Check if there are more lines to read
+                        if f.readline() != "":
+                            next_token = str(current_line)
+                        break
+        except FileNotFoundError:
+            pass
+
+        return JobSubmissionLogs(logs=logs, next_token=next_token)
+
+    def _poll_logs_descending(
+        self, log_file_path: Path, request: PollLogsRequest
+    ) -> JobSubmissionLogs:
+        start_offset = self._next_token(request)
+
+        candidate_logs = []
+
+        try:
+            line_generator = self._read_lines_reversed(log_file_path, start_offset)
+
+            for line_bytes, line_start_offset in line_generator:
+                try:
+                    line_str = line_bytes.decode("utf-8")
+                    log_event = LogEvent.__response__.parse_raw(line_str)
+                except Exception:
+                    continue  # Skip malformed lines
+
+                if request.end_time is not None and log_event.timestamp > request.end_time:
                     continue
-                    if request.end_time is not None and log_event.timestamp >= request.end_time:
+                if request.start_time and log_event.timestamp <= request.start_time:
                     break
 
-                    logs.append(log_event)
+                candidate_logs.append((log_event, line_start_offset))
 
-                    if len(logs) >= request.limit:
-                        # Only set next_token if there are more lines to read
-                        if current_line < len(lines):
-                            next_token = str(current_line)
+                if len(candidate_logs) > request.limit:
                     break
+        except FileNotFoundError:
+            return JobSubmissionLogs(logs=[], next_token=None)
 
-            except FileNotFoundError:
-                pass
+        logs = [log for log, offset in candidate_logs[: request.limit]]
+        next_token = None
+        if len(candidate_logs) > request.limit:
+            # We fetched one more than the limit, so there are more pages.
+            # The next token should point to the start of the last log we are returning.
+            _last_log_event, last_log_offset = candidate_logs[request.limit - 1]
+            next_token = str(last_log_offset)
 
         return JobSubmissionLogs(logs=logs, next_token=next_token)
 
+    @staticmethod
+    def _read_lines_reversed(
+        filepath: Path, start_offset: Optional[int] = None, chunk_size: int = 8192
+    ) -> Generator[Tuple[bytes, int], None, None]:
+        """
+        A generator that yields lines from a file in reverse order, along with the byte
+        offset of the start of each line. This is memory-efficient for large files.
+        """
+        with open(filepath, "rb") as f:
+            f.seek(0, os.SEEK_END)
+            file_size = f.tell()
+            cursor = file_size
+
+            # If a start_offset is provided, optimize by starting the read
+            # from a more specific location instead of the end of the file.
+            if start_offset is not None and start_offset < file_size:
+                # To get the full content of the line that straddles the offset,
+                # we need to find its end (the next newline character).
+                f.seek(start_offset)
+                chunk = f.read(chunk_size)
+                newline_pos = chunk.find(b"\n")
+                if newline_pos != -1:
+                    # Found the end of the line. The cursor for reverse reading
+                    # should start from this point to include the full line.
+                    cursor = start_offset + newline_pos + 1
+                else:
+                    # No newline found, which means the rest of the file is one line.
+                    # The default cursor pointing to file_size is correct.
+                    pass
 
+            buffer = b""
+
+            while cursor > 0:
+                seek_pos = max(0, cursor - chunk_size)
+                amount_to_read = cursor - seek_pos
+                f.seek(seek_pos)
+                chunk = f.read(amount_to_read)
+                cursor = seek_pos
+
+                buffer = chunk + buffer
+
+                while b"\n" in buffer:
+                    newline_pos = buffer.rfind(b"\n")
+                    line = buffer[newline_pos + 1 :]
+                    line_start_offset = cursor + newline_pos + 1
+
+                    # Skip lines that start at or after the start_offset
+                    if start_offset is None or line_start_offset < start_offset:
+                        yield line, line_start_offset
+
+                    buffer = buffer[:newline_pos]
+
+            # The remaining buffer is the first line of the file.
+            # Only yield it if we're not using start_offset or if it starts before start_offset
+            if buffer and (start_offset is None or 0 < start_offset):
+                yield buffer, 0
+
     def write_logs(
         self,
         project: ProjectModel,
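A small standalone check of the reverse reader's contract: lines come back newest-first, each paired with the byte offset of its first character, which is exactly what _poll_logs_descending stores in next_token. The file contents below are made up; note the empty trailing chunk produced by the final newline, which poll_logs later skips as malformed:

    import os
    import tempfile
    from pathlib import Path

    from dstack._internal.server.services.logs.filelog import FileLogStorage

    with tempfile.NamedTemporaryFile("wb", suffix=".jsonl", delete=False) as f:
        f.write(b'{"ts": 1}\n{"ts": 2}\n{"ts": 3}\n')  # three 10-byte lines
        path = Path(f.name)

    # Prints: (30, b''), (20, b'{"ts": 3}'), (10, b'{"ts": 2}'), (0, b'{"ts": 1}')
    for line, offset in FileLogStorage._read_lines_reversed(path):
        print((offset, line))
    os.unlink(path)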
@@ -140,5 +242,19 @@ class FileLogStorage(LogStorage):
         return LogEvent(
             timestamp=unix_time_ms_to_datetime(runner_log_event.timestamp),
             log_source=LogEventSource.STDOUT,
-            message=b64encode_raw_message(runner_log_event.message),
+            message=runner_log_event.message.decode(errors="replace"),
         )
+
+    def _next_token(self, request: PollLogsRequest) -> Optional[int]:
+        next_token = request.next_token
+        if next_token is None:
+            return None
+        try:
+            value = int(next_token)
+            if value < 0:
+                raise ValueError("Offset must be non-negative")
+            return value
+        except (ValueError, TypeError):
+            raise ServerClientError(
+                f"Invalid next_token: {next_token}. Must be a non-negative integer."
+            )
dstack/_internal/server/services/logs/gcp.py

@@ -14,7 +14,6 @@ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
 from dstack._internal.server.services.logs.base import (
     LogStorage,
     LogStorageError,
-    b64encode_raw_message,
     unix_time_ms_to_datetime,
 )
 from dstack._internal.utils.common import batched

@@ -137,15 +136,14 @@ class GCPLogStorage(LogStorage):
         with self.logger.batch() as batcher:
             for batch in batched(logs, self.MAX_BATCH_SIZE):
                 for log in batch:
-                    message = b64encode_raw_message(log.message)
+                    message = log.message.decode(errors="replace")
                     timestamp = unix_time_ms_to_datetime(log.timestamp)
-
-                    if len(message) > self.MAX_RUNNER_MESSAGE_SIZE:
+                    if len(log.message) > self.MAX_RUNNER_MESSAGE_SIZE:
                         logger.error(
                             "Stream %s: skipping event at %s, message exceeds max size: %d > %d",
                             stream_name,
                             timestamp.isoformat(),
-                            len(message),
+                            len(log.message),
                             self.MAX_RUNNER_MESSAGE_SIZE,
                         )
                         continue
|
|
|
1
1
|
import itertools
|
|
2
|
+
import json
|
|
2
3
|
from collections import defaultdict
|
|
3
4
|
from collections.abc import Generator, Iterable
|
|
4
5
|
from datetime import timezone
|
|
@@ -177,6 +178,19 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
|
|
|
177
178
|
metrics.add_sample(_JOB_CPU_TIME, labels, jmp.cpu_usage_micro / 1_000_000)
|
|
178
179
|
metrics.add_sample(_JOB_MEMORY_USAGE, labels, jmp.memory_usage_bytes)
|
|
179
180
|
metrics.add_sample(_JOB_MEMORY_WORKING_SET, labels, jmp.memory_working_set_bytes)
|
|
181
|
+
if gpus:
|
|
182
|
+
gpu_memory_total = gpus[0].memory_mib * 1024 * 1024
|
|
183
|
+
for gpu_num, (gpu_util, gpu_memory_usage) in enumerate(
|
|
184
|
+
zip(
|
|
185
|
+
json.loads(jmp.gpus_util_percent),
|
|
186
|
+
json.loads(jmp.gpus_memory_usage_bytes),
|
|
187
|
+
)
|
|
188
|
+
):
|
|
189
|
+
gpu_labels = labels.copy()
|
|
190
|
+
gpu_labels["dstack_gpu_num"] = gpu_num
|
|
191
|
+
metrics.add_sample(_JOB_GPU_USAGE_RATIO, gpu_labels, gpu_util / 100)
|
|
192
|
+
metrics.add_sample(_JOB_GPU_MEMORY_TOTAL, gpu_labels, gpu_memory_total)
|
|
193
|
+
metrics.add_sample(_JOB_GPU_MEMORY_USAGE, gpu_labels, gpu_memory_usage)
|
|
180
194
|
jpm = job_prometheus_metrics.get(job.id)
|
|
181
195
|
if jpm is not None:
|
|
182
196
|
for metric in text_string_to_metric_families(jpm.text):
|
|
@@ -202,6 +216,9 @@ _JOB_CPU_TIME = "dstack_job_cpu_time_seconds_total"
|
|
|
202
216
|
_JOB_MEMORY_TOTAL = "dstack_job_memory_total_bytes"
|
|
203
217
|
_JOB_MEMORY_USAGE = "dstack_job_memory_usage_bytes"
|
|
204
218
|
_JOB_MEMORY_WORKING_SET = "dstack_job_memory_working_set_bytes"
|
|
219
|
+
_JOB_GPU_USAGE_RATIO = "dstack_job_gpu_usage_ratio"
|
|
220
|
+
_JOB_GPU_MEMORY_TOTAL = "dstack_job_gpu_memory_total_bytes"
|
|
221
|
+
_JOB_GPU_MEMORY_USAGE = "dstack_job_gpu_memory_usage_bytes"
|
|
205
222
|
|
|
206
223
|
|
|
207
224
|
class _Metrics(dict[str, Metric]):
|
|
@@ -259,6 +276,9 @@ class _JobMetrics(_Metrics):
|
|
|
259
276
|
(_JOB_MEMORY_TOTAL, _GAUGE, "Total memory allocated for the job, bytes"),
|
|
260
277
|
(_JOB_MEMORY_USAGE, _GAUGE, "Memory used by the job (including cache), bytes"),
|
|
261
278
|
(_JOB_MEMORY_WORKING_SET, _GAUGE, "Memory used by the job (not including cache), bytes"),
|
|
279
|
+
(_JOB_GPU_USAGE_RATIO, _GAUGE, "Job GPU usage, percent (as 0.0-1.0)"),
|
|
280
|
+
(_JOB_GPU_MEMORY_TOTAL, _GAUGE, "Total GPU memory allocated for the job, bytes"),
|
|
281
|
+
(_JOB_GPU_MEMORY_USAGE, _GAUGE, "GPU memory used by the job, bytes"),
|
|
262
282
|
]
|
|
263
283
|
|
|
264
284
|
|
|
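With the loop above, a job on two 80 GiB GPUs would be exposed roughly as the following samples, one set per dstack_gpu_num; the numbers are hypothetical and the other job labels are abbreviated. Note that gpu_memory_total is taken from gpus[0].memory_mib, so homogeneous GPUs are assumed:

    dstack_job_gpu_usage_ratio{dstack_gpu_num="0",...} 0.87
    dstack_job_gpu_memory_total_bytes{dstack_gpu_num="0",...} 85899345920
    dstack_job_gpu_memory_usage_bytes{dstack_gpu_num="0",...} 42949672960
    dstack_job_gpu_usage_ratio{dstack_gpu_num="1",...} 0.12
    dstack_job_gpu_memory_total_bytes{dstack_gpu_num="1",...} 85899345920
    dstack_job_gpu_memory_usage_bytes{dstack_gpu_num="1",...} 1073741824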
dstack/_internal/server/services/proxy/repo.py

@@ -12,10 +12,12 @@ from dstack._internal.core.models.configurations import ServiceConfiguration
 from dstack._internal.core.models.instances import RemoteConnectionInfo, SSHConnectionParams
 from dstack._internal.core.models.runs import (
     JobProvisioningData,
+    JobSpec,
     JobStatus,
     RunSpec,
     RunStatus,
     ServiceSpec,
+    get_service_port,
 )
 from dstack._internal.core.models.services import AnyModel
 from dstack._internal.proxy.lib.models import (

@@ -97,9 +99,10 @@ class ServerProxyRepo(BaseProxyRepo):
         if rci.ssh_proxy is not None:
             ssh_head_proxy = rci.ssh_proxy
             ssh_head_proxy_private_key = get_or_error(rci.ssh_proxy_keys)[0].private
+        job_spec: JobSpec = JobSpec.__response__.parse_raw(job.job_spec_data)
         replica = Replica(
             id=job.id.hex,
-            app_port=run_spec.configuration.port.container_port,
+            app_port=get_service_port(job_spec, run_spec.configuration),
             ssh_destination=ssh_destination,
             ssh_port=ssh_port,
             ssh_proxy=ssh_proxy,