dstack 0.19.25__py3-none-any.whl → 0.19.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack has been flagged as potentially problematic.
- dstack/_internal/cli/commands/__init__.py +2 -2
- dstack/_internal/cli/commands/apply.py +3 -61
- dstack/_internal/cli/commands/attach.py +1 -1
- dstack/_internal/cli/commands/completion.py +1 -1
- dstack/_internal/cli/commands/delete.py +2 -2
- dstack/_internal/cli/commands/fleet.py +1 -1
- dstack/_internal/cli/commands/gateway.py +2 -2
- dstack/_internal/cli/commands/init.py +56 -24
- dstack/_internal/cli/commands/logs.py +1 -1
- dstack/_internal/cli/commands/metrics.py +1 -1
- dstack/_internal/cli/commands/offer.py +45 -7
- dstack/_internal/cli/commands/project.py +2 -2
- dstack/_internal/cli/commands/secrets.py +2 -2
- dstack/_internal/cli/commands/server.py +1 -1
- dstack/_internal/cli/commands/stop.py +1 -1
- dstack/_internal/cli/commands/volume.py +1 -1
- dstack/_internal/cli/main.py +2 -2
- dstack/_internal/cli/services/completion.py +2 -2
- dstack/_internal/cli/services/configurators/__init__.py +6 -2
- dstack/_internal/cli/services/configurators/base.py +6 -7
- dstack/_internal/cli/services/configurators/fleet.py +1 -3
- dstack/_internal/cli/services/configurators/gateway.py +2 -4
- dstack/_internal/cli/services/configurators/run.py +195 -55
- dstack/_internal/cli/services/configurators/volume.py +2 -4
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/cli/services/repos.py +51 -47
- dstack/_internal/core/backends/aws/configurator.py +11 -7
- dstack/_internal/core/backends/azure/configurator.py +11 -7
- dstack/_internal/core/backends/base/configurator.py +25 -13
- dstack/_internal/core/backends/cloudrift/configurator.py +13 -7
- dstack/_internal/core/backends/cudo/configurator.py +11 -7
- dstack/_internal/core/backends/datacrunch/compute.py +5 -1
- dstack/_internal/core/backends/datacrunch/configurator.py +13 -7
- dstack/_internal/core/backends/gcp/configurator.py +11 -7
- dstack/_internal/core/backends/hotaisle/configurator.py +13 -7
- dstack/_internal/core/backends/kubernetes/configurator.py +13 -7
- dstack/_internal/core/backends/lambdalabs/configurator.py +11 -7
- dstack/_internal/core/backends/nebius/compute.py +1 -1
- dstack/_internal/core/backends/nebius/configurator.py +11 -7
- dstack/_internal/core/backends/nebius/resources.py +21 -11
- dstack/_internal/core/backends/oci/configurator.py +11 -7
- dstack/_internal/core/backends/runpod/configurator.py +11 -7
- dstack/_internal/core/backends/template/configurator.py.jinja +11 -7
- dstack/_internal/core/backends/tensordock/configurator.py +13 -7
- dstack/_internal/core/backends/vastai/configurator.py +11 -7
- dstack/_internal/core/backends/vultr/configurator.py +11 -4
- dstack/_internal/core/compatibility/gpus.py +13 -0
- dstack/_internal/core/compatibility/runs.py +1 -0
- dstack/_internal/core/models/common.py +3 -3
- dstack/_internal/core/models/configurations.py +172 -27
- dstack/_internal/core/models/files.py +1 -1
- dstack/_internal/core/models/fleets.py +5 -1
- dstack/_internal/core/models/profiles.py +41 -11
- dstack/_internal/core/models/resources.py +46 -42
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/core/services/configs/__init__.py +2 -2
- dstack/_internal/core/services/profiles.py +2 -2
- dstack/_internal/core/services/repos.py +5 -3
- dstack/_internal/core/services/ssh/ports.py +1 -1
- dstack/_internal/proxy/lib/deps.py +6 -2
- dstack/_internal/server/app.py +22 -17
- dstack/_internal/server/background/tasks/process_gateways.py +4 -1
- dstack/_internal/server/background/tasks/process_instances.py +10 -2
- dstack/_internal/server/background/tasks/process_probes.py +1 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +10 -4
- dstack/_internal/server/background/tasks/process_runs.py +1 -1
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +54 -43
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
- dstack/_internal/server/background/tasks/process_volumes.py +1 -1
- dstack/_internal/server/db.py +8 -4
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/gpus.py +1 -6
- dstack/_internal/server/schemas/runner.py +10 -0
- dstack/_internal/server/services/backends/__init__.py +14 -8
- dstack/_internal/server/services/backends/handlers.py +6 -1
- dstack/_internal/server/services/docker.py +5 -5
- dstack/_internal/server/services/fleets.py +14 -13
- dstack/_internal/server/services/gateways/__init__.py +2 -0
- dstack/_internal/server/services/gateways/client.py +5 -2
- dstack/_internal/server/services/gateways/connection.py +1 -1
- dstack/_internal/server/services/gpus.py +50 -49
- dstack/_internal/server/services/instances.py +41 -1
- dstack/_internal/server/services/jobs/__init__.py +15 -4
- dstack/_internal/server/services/jobs/configurators/base.py +7 -11
- dstack/_internal/server/services/jobs/configurators/dev.py +5 -0
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -3
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +3 -3
- dstack/_internal/server/services/jobs/configurators/service.py +1 -0
- dstack/_internal/server/services/jobs/configurators/task.py +3 -0
- dstack/_internal/server/services/locking.py +5 -5
- dstack/_internal/server/services/logging.py +10 -2
- dstack/_internal/server/services/logs/__init__.py +8 -6
- dstack/_internal/server/services/logs/aws.py +330 -327
- dstack/_internal/server/services/logs/filelog.py +7 -6
- dstack/_internal/server/services/logs/gcp.py +141 -139
- dstack/_internal/server/services/plugins.py +1 -1
- dstack/_internal/server/services/projects.py +2 -5
- dstack/_internal/server/services/proxy/repo.py +5 -1
- dstack/_internal/server/services/requirements/__init__.py +0 -0
- dstack/_internal/server/services/requirements/combine.py +259 -0
- dstack/_internal/server/services/runner/client.py +7 -0
- dstack/_internal/server/services/runs.py +1 -1
- dstack/_internal/server/services/services/__init__.py +8 -2
- dstack/_internal/server/services/services/autoscalers.py +2 -0
- dstack/_internal/server/services/ssh.py +2 -1
- dstack/_internal/server/services/storage/__init__.py +5 -6
- dstack/_internal/server/services/storage/gcs.py +49 -49
- dstack/_internal/server/services/storage/s3.py +52 -52
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/testing/common.py +1 -1
- dstack/_internal/server/utils/logging.py +3 -3
- dstack/_internal/server/utils/provisioning.py +3 -3
- dstack/_internal/utils/json_schema.py +3 -1
- dstack/_internal/utils/typing.py +14 -0
- dstack/api/_public/repos.py +21 -2
- dstack/api/_public/runs.py +5 -7
- dstack/api/server/__init__.py +17 -19
- dstack/api/server/_gpus.py +2 -1
- dstack/api/server/_group.py +4 -3
- dstack/api/server/_repos.py +20 -3
- dstack/plugins/builtin/rest_plugin/_plugin.py +1 -0
- dstack/version.py +1 -1
- {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/METADATA +1 -1
- {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/RECORD +127 -124
- dstack/api/huggingface/__init__.py +0 -73
- {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/WHEEL +0 -0
- {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/licenses/LICENSE.md +0 -0
@@ -24,347 +24,350 @@ from dstack._internal.server.services.logs.base import (
 )
 from dstack._internal.utils.logging import get_logger
 
+logger = get_logger(__name__)
+
+
 BOTO_AVAILABLE = True
 try:
     import boto3
     import botocore.exceptions
 except ImportError:
     BOTO_AVAILABLE = False
+else:
+
+    class _CloudWatchLogEvent(TypedDict):
+        timestamp: int  # unix time in milliseconds
+        message: str
+
+    class CloudWatchLogStorage(LogStorage):
+        # "The maximum number of log events in a batch is 10,000".
+        EVENT_MAX_COUNT_IN_BATCH = 10000
+        # "The maximum batch size is 1,048,576 bytes" — exactly 1 MiB. "This size is calculated
+        # as the sum of all event messages in UTF-8, plus 26 bytes for each log event".
+        BATCH_MAX_SIZE = 1048576
+        # "Each log event can be no larger than 256 KB" — KB means KiB; includes MESSAGE_OVERHEAD_SIZE.
+        MESSAGE_MAX_SIZE = 262144
+        # Message size in bytes = len(message.encode("utf-8")) + MESSAGE_OVERHEAD_SIZE.
+        MESSAGE_OVERHEAD_SIZE = 26
+        # "A batch of log events in a single request cannot span more than 24 hours".
+        BATCH_MAX_SPAN = int(timedelta(hours=24).total_seconds()) * 1000
+        # Decrease allowed deltas by possible clock drift between dstack and CloudWatch.
+        CLOCK_DRIFT = int(timedelta(minutes=10).total_seconds()) * 1000
+        # "None of the log events in the batch can be more than 14 days in the past."
+        PAST_EVENT_MAX_DELTA = int((timedelta(days=14)).total_seconds()) * 1000 - CLOCK_DRIFT
+        # "None of the log events in the batch can be more than 2 hours in the future."
+        FUTURE_EVENT_MAX_DELTA = int((timedelta(hours=2)).total_seconds()) * 1000 - CLOCK_DRIFT
+        # Maximum number of retries when polling for log events to skip empty pages.
+        MAX_RETRIES = 10
+
+        def __init__(self, *, group: str, region: Optional[str] = None) -> None:
+            with self._wrap_boto_errors():
+                session = boto3.Session(region_name=region)
+                self._client = session.client("logs")
+                self._check_group_exists(group)
+            self._group = group
+            self._region = self._client.meta.region_name
+            # Stores names of already created streams.
+            # XXX: This set acts as an unbound cache. If this becomes a problem (in case of _very_ long
+            # running server and/or lots of jobs, consider replacing it with an LRU cache, e.g.,
+            # a simple OrderedDict-based implementation should be OK.
+            self._streams: Set[str] = set()
+
+        def close(self) -> None:
+            self._client.close()
+
+        def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
+            log_producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
+            stream = self._get_stream_name(
+                project.name, request.run_name, request.job_submission_id, log_producer
             )
+            cw_events: List[_CloudWatchLogEvent]
+            next_token: Optional[str] = None
+            with self._wrap_boto_errors():
+                try:
+                    cw_events, next_token = self._get_log_events_with_retry(stream, request)
+                except botocore.exceptions.ClientError as e:
+                    if not self._is_resource_not_found_exception(e):
+                        raise
+                    # Check if the group exists to distinguish between group not found vs stream not found
+                    try:
+                        self._check_group_exists(self._group)
+                        # Group exists, so the error must be due to missing stream
+                        logger.debug("Stream %s not found, returning dummy response", stream)
+                        cw_events = []
+                    except LogStorageError:
+                        # Group doesn't exist, re-raise the LogStorageError
+                        raise
+            logs = [
+                LogEvent(
+                    timestamp=unix_time_ms_to_datetime(cw_event["timestamp"]),
+                    log_source=LogEventSource.STDOUT,
+                    message=cw_event["message"],
+                )
+                for cw_event in cw_events
+            ]
+            return JobSubmissionLogs(
+                logs=logs,
+                external_url=self._get_stream_external_url(stream),
                 next_token=next_token,
             )
 
+        def _get_log_events_with_retry(
+            self, stream: str, request: PollLogsRequest
+        ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
+            current_request = request
+            previous_next_token = request.next_token
+            next_token = None
+
+            for _ in range(self.MAX_RETRIES):
+                cw_events, next_token = self._get_log_events(stream, current_request)
+
+                if cw_events:
+                    return cw_events, next_token
+
+                if not next_token or next_token == previous_next_token:
+                    return [], None
+
+                previous_next_token = next_token
+                current_request = PollLogsRequest(
+                    run_name=request.run_name,
+                    job_submission_id=request.job_submission_id,
+                    start_time=request.start_time,
+                    end_time=request.end_time,
+                    descending=request.descending,
+                    next_token=next_token,
+                    limit=request.limit,
+                    diagnose=request.diagnose,
+                )
 
+            if not request.descending:
+                logger.debug(
+                    "Stream %s: exhausted %d retries without finding logs, returning empty response",
+                    stream,
+                    self.MAX_RETRIES,
+                )
+            # Only return the next token after exhausting retries if going descending—
+            # AWS CloudWatch guarantees more logs in that case. In ascending mode,
+            # next token is always returned, even if no logs remain.
+            # So descending works reliably; ascending has limits if gaps are too large.
+            # In the future, UI/CLI should handle retries, and we can return next token for ascending too.
+            return [], next_token if request.descending else None
+
+        def _get_log_events(
+            self, stream: str, request: PollLogsRequest
+        ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
+            start_from_head = not request.descending
+            parameters = {
+                "logGroupName": self._group,
+                "logStreamName": stream,
+                "limit": request.limit,
+                "startFromHead": start_from_head,
+            }
+
+            if request.start_time:
+                parameters["startTime"] = datetime_to_unix_time_ms(request.start_time)
+
+            if request.end_time:
+                parameters["endTime"] = datetime_to_unix_time_ms(request.end_time)
+            elif start_from_head:
+                # When startFromHead=true and no endTime is provided, set endTime to "now"
+                # to prevent infinite pagination as new logs arrive faster than we can read them
+                parameters["endTime"] = datetime_to_unix_time_ms(datetime.now(timezone.utc))
+
+            if request.next_token:
+                parameters["nextToken"] = request.next_token
+
+            response = self._client.get_log_events(**parameters)
+
+            events = response.get("events", [])
+            next_token_key = "nextForwardToken" if start_from_head else "nextBackwardToken"
+            next_token = response.get(next_token_key)
+
+            # TODO: The code below is not going to be used until we migrate from base64-encoded logs to plain text logs.
+            if request.descending:
+                events = list(reversed(events))
+
+            return events, next_token
+
+        def _get_stream_external_url(self, stream: str) -> str:
+            quoted_group = urllib.parse.quote(self._group, safe="")
+            quoted_stream = urllib.parse.quote(stream, safe="")
+            return f"https://console.aws.amazon.com/cloudwatch/home?region={self._region}#logsV2:log-groups/log-group/{quoted_group}/log-events/{quoted_stream}"
+
+        def write_logs(
+            self,
+            project: ProjectModel,
+            run_name: str,
+            job_submission_id: UUID,
+            runner_logs: List[RunnerLogEvent],
+            job_logs: List[RunnerLogEvent],
+        ):
+            if len(runner_logs) > 0:
+                runner_stream = self._get_stream_name(
+                    project.name, run_name, job_submission_id, LogProducer.RUNNER
+                )
+                self._write_logs(
+                    stream=runner_stream,
+                    log_events=runner_logs,
+                )
+            if len(job_logs) > 0:
+                jog_stream = self._get_stream_name(
+                    project.name, run_name, job_submission_id, LogProducer.JOB
+                )
+                self._write_logs(
+                    stream=jog_stream,
+                    log_events=job_logs,
+                )
+
+        def _write_logs(self, stream: str, log_events: List[RunnerLogEvent]) -> None:
+            with self._wrap_boto_errors():
+                self._ensure_stream_exists(stream)
+                try:
+                    self._put_log_events(stream, log_events)
+                    return
+                except botocore.exceptions.ClientError as e:
+                    if not self._is_resource_not_found_exception(e):
+                        raise
+                    logger.debug("Stream %s not found, recreating", stream)
+                # The stream is probably deleted due to retention policy, our cache is stale.
+                self._ensure_stream_exists(stream, force=True)
                 self._put_log_events(stream, log_events)
 
+        def _put_log_events(self, stream: str, log_events: List[RunnerLogEvent]) -> None:
+            # Python docs: "The built-in sorted() function is guaranteed to be stable."
+            sorted_log_events = sorted(log_events, key=operator.attrgetter("timestamp"))
+            if tuple(map(id, log_events)) != tuple(map(id, sorted_log_events)):
                 logger.error(
+                    "Stream %s: events are not in chronological order, something wrong with runner",
                     stream,
                 )
+            for batch in self._get_batch_iter(stream, sorted_log_events):
+                self._client.put_log_events(
+                    logGroupName=self._group,
+                    logStreamName=stream,
+                    logEvents=batch,
+                )
+
+        def _get_batch_iter(
+            self, stream: str, log_events: List[RunnerLogEvent]
+        ) -> Iterator[List[_CloudWatchLogEvent]]:
+            shared_event_iter = iter(log_events)
+            event_iter = shared_event_iter
+            while True:
+                batch, excessive_event = self._get_next_batch(stream, event_iter)
+                if not batch:
+                    return
+                yield batch
+                if excessive_event is not None:
+                    event_iter = itertools.chain([excessive_event], shared_event_iter)
+                else:
+                    event_iter = shared_event_iter
+
+        def _get_next_batch(
+            self, stream: str, event_iter: Iterator[RunnerLogEvent]
+        ) -> Tuple[List[_CloudWatchLogEvent], Optional[RunnerLogEvent]]:
+            now_timestamp = int(datetime.now(timezone.utc).timestamp() * 1000)
+            batch: List[_CloudWatchLogEvent] = []
+            total_size = 0
+            event_count = 0
+            first_timestamp: Optional[int] = None
+            skipped_past_events = 0
+            skipped_future_events = 0
+            # event that doesn't fit in the current batch
+            excessive_event: Optional[RunnerLogEvent] = None
+            for event in event_iter:
+                # Normally there should not be empty messages.
+                if not event.message:
+                    continue
+                timestamp = event.timestamp
+                if first_timestamp is None:
+                    first_timestamp = timestamp
+                elif timestamp - first_timestamp > self.BATCH_MAX_SPAN:
+                    excessive_event = event
+                    break
+                if now_timestamp - timestamp > self.PAST_EVENT_MAX_DELTA:
+                    skipped_past_events += 1
+                    continue
+                if timestamp - now_timestamp > self.FUTURE_EVENT_MAX_DELTA:
+                    skipped_future_events += 1
+                    continue
+                cw_event = self._runner_log_event_to_cloudwatch_event(event)
+                message_size = len(event.message) + self.MESSAGE_OVERHEAD_SIZE
+                if message_size > self.MESSAGE_MAX_SIZE:
+                    # we should never hit this limit, as we use `io.Copy` to copy from pty to logs,
+                    # which under the hood uses 32KiB buffer, see runner/internal/executor/executor.go,
+                    # `execJob` -> `io.Copy(logger, ptmx)`
+                    logger.error(
+                        "Stream %s: skipping event %d, message exceeds max size: %d > %d",
+                        stream,
+                        timestamp,
+                        message_size,
+                        self.MESSAGE_MAX_SIZE,
+                    )
+                    continue
+                if total_size + message_size > self.BATCH_MAX_SIZE:
+                    excessive_event = event
+                    break
+                batch.append(cw_event)
+                total_size += message_size
+                event_count += 1
+                if event_count >= self.EVENT_MAX_COUNT_IN_BATCH:
+                    break
+            if skipped_past_events > 0:
+                logger.error("Stream %s: skipping %d past event(s)", stream, skipped_past_events)
+            if skipped_future_events > 0:
+                logger.error(
+                    "Stream %s: skipping %d future event(s)", stream, skipped_future_events
+                )
+            return batch, excessive_event
+
+        def _runner_log_event_to_cloudwatch_event(
+            self, runner_log_event: RunnerLogEvent
+        ) -> _CloudWatchLogEvent:
+            return {
+                "timestamp": runner_log_event.timestamp,
+                "message": runner_log_event.message.decode(errors="replace"),
+            }
+
+        @contextmanager
+        def _wrap_boto_errors(self) -> Iterator[None]:
+            try:
+                yield
+            except (botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError) as e:
+                raise LogStorageError(f"CloudWatch Logs error: {type(e).__name__}: {e}") from e
+
+        def _is_resource_not_found_exception(self, exc: "botocore.exceptions.ClientError") -> bool:
+            try:
+                return exc.response["Error"]["Code"] == "ResourceNotFoundException"
+            except KeyError:
+                return False
+
+        def _check_group_exists(self, name: str) -> None:
+            try:
+                self._client.describe_log_streams(logGroupName=name, limit=1)
+            except botocore.exceptions.ClientError as e:
+                if self._is_resource_not_found_exception(e):
+                    raise LogStorageError(f"LogGroup '{name}' does not exist")
+                raise
+
+        def _ensure_stream_exists(self, name: str, *, force: bool = False) -> None:
+            if not force and name in self._streams:
                 return
+            response = self._client.describe_log_streams(
+                logGroupName=self._group, logStreamNamePrefix=name
+            )
+            for stream in response["logStreams"]:
+                if stream["logStreamName"] == name:
+                    self._streams.add(name)
+                    return
+            self._client.create_log_stream(logGroupName=self._group, logStreamName=name)
+            self._streams.add(name)
+
+        def _get_stream_name(
+            self,
+            project_name: str,
+            run_name: str,
+            job_submission_id: UUID,
+            producer: LogProducer,
+        ) -> str:
+            return f"{project_name}/{run_name}/{job_submission_id}/{producer.value}"