eodag-3.6.0-py3-none-any.whl → eodag-3.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eodag/api/core.py +0 -14
- eodag/api/product/metadata_mapping.py +20 -3
- eodag/cli.py +6 -3
- eodag/config.py +5 -0
- eodag/plugins/authentication/openid_connect.py +1 -2
- eodag/plugins/download/aws.py +145 -178
- eodag/plugins/download/base.py +3 -2
- eodag/plugins/download/creodias_s3.py +10 -5
- eodag/plugins/download/http.py +14 -6
- eodag/plugins/download/s3rest.py +1 -2
- eodag/plugins/manager.py +1 -1
- eodag/plugins/search/base.py +34 -4
- eodag/plugins/search/build_search_result.py +3 -0
- eodag/plugins/search/cop_marine.py +2 -0
- eodag/plugins/search/qssearch.py +44 -25
- eodag/resources/ext_product_types.json +1 -1
- eodag/resources/product_types.yml +30 -153
- eodag/resources/providers.yml +48 -325
- eodag/resources/stac.yml +1 -2
- eodag/resources/user_conf_template.yml +0 -11
- eodag/rest/core.py +5 -9
- eodag/utils/__init__.py +41 -27
- eodag/utils/exceptions.py +4 -0
- eodag/utils/s3.py +605 -65
- {eodag-3.6.0.dist-info → eodag-3.7.0.dist-info}/METADATA +7 -8
- {eodag-3.6.0.dist-info → eodag-3.7.0.dist-info}/RECORD +30 -30
- {eodag-3.6.0.dist-info → eodag-3.7.0.dist-info}/WHEEL +0 -0
- {eodag-3.6.0.dist-info → eodag-3.7.0.dist-info}/entry_points.txt +0 -0
- {eodag-3.6.0.dist-info → eodag-3.7.0.dist-info}/licenses/LICENSE +0 -0
- {eodag-3.6.0.dist-info → eodag-3.7.0.dist-info}/top_level.txt +0 -0
eodag/utils/s3.py
CHANGED
@@ -20,118 +20,458 @@ from __future__ import annotations
 import io
 import logging
 import os
-import
-from
+import uuid
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import TYPE_CHECKING
 from urllib.parse import urlparse
+from zipfile import ZIP_STORED, ZipFile

 import boto3
 import botocore
+import botocore.exceptions
+from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
+from stream_zip import ZIP_AUTO, stream_zip

 from eodag.plugins.authentication.aws_auth import AwsAuth
-from eodag.utils import
+from eodag.utils import (
+    StreamResponse,
+    get_bucket_name_and_prefix,
+    guess_file_type,
+    parse_le_uint16,
+    parse_le_uint32,
+)
 from eodag.utils.exceptions import (
     AuthenticationError,
+    DownloadError,
+    InvalidDataError,
     MisconfiguredError,
     NotAvailableError,
 )

 if TYPE_CHECKING:
-    from
+    from typing import Iterable, Iterator, Literal, Optional
+    from zipfile import ZipInfo

     from mypy_boto3_s3.client import S3Client
+    from stream_zip import Method

     from eodag.api.product import EOProduct  # type: ignore

 logger = logging.getLogger("eodag.utils.s3")

+MIME_OCTET_STREAM = "application/octet-stream"
+

-def
-    bucket_name: str, key_name: str, start: int,
+def fetch_range(
+    bucket_name: str, key_name: str, start: int, end: int, client_s3: S3Client
 ) -> bytes:
     """
     Range-fetches a S3 key.

     :param bucket_name: Bucket name of the object to fetch
     :param key_name: Key name of the object to fetch
-    :param start:
-    :param
+    :param start: Start byte position to fetch
+    :param end: End byte position to fetch
     :param client_s3: s3 client used to fetch the object
     :returns: Object bytes
     """
-
-    s3_object = client_s3.get_object(
+    response = client_s3.get_object(
         Bucket=bucket_name, Key=key_name, Range="bytes=%d-%d" % (start, end)
     )
-    return
+    return response["Body"].read()


-
+@dataclass
+class S3FileInfo:
     """
-
-
-    :param bytes: bytes to parse
-    :returns: parsed int
+    Describe a S3 object with basic f_info and its download state.
     """
-    val = (bytes[0]) + ((bytes[1]) << 8)
-    if len(bytes) > 3:
-        val += ((bytes[2]) << 16) + ((bytes[3]) << 24)
-    return val

+    size: int
+    key: str
+    bucket_name: str
+    #: Path inside the ZIP archive if the file is stored inside a ZIP.
+    zip_filepath: Optional[str] = None
+    #: Offset in the ZIP archive where the file data starts.
+    data_start_offset: int = 0
+    #: MIME type of the file, defaulting to application/octet-stream.
+    #: It can be updated based on the file extension or content type.
+    data_type: str = MIME_OCTET_STREAM
+    #: Relative path of the file, if applicable (e.g., inside a ZIP archive).
+    rel_path: Optional[str] = None
+
+    # These fields hold the state for downloading
+    #: Offset in the logical (global) file stream where this file starts.
+    file_start_offset: int = 0
+    #: Mapping of futures to their start byte offsets, used to track download progress.
+    #: Each future corresponds to a chunk of data being downloaded.
+    #: The key is the future object, and the value is the start byte offset of that
+    #: chunk in the logical file stream.
+    futures: dict = field(default_factory=dict)
+    #: Buffers for downloaded data chunks, mapping start byte offsets to the actual data.
+    #: This allows for partial downloads and efficient memory usage.
+    #: The key is the start byte offset, and the value is the bytes data for that
+    #: offset. This is used to yield data in the correct order during streaming.
+    #: It is updated as chunks are downloaded.
+    buffers: dict[int, bytes] = field(default_factory=dict)
+    #: The next offset to yield in the file, used to track progress during downloading
+    #: and yielding chunks. It starts at 0 and is updated as data is yielded.
+    #: This allows the streaming process to continue from where it left off,
+    #: ensuring that all data is eventually yielded without duplication.
+    next_yield: int = 0
+
+
+def _prepare_file_in_zip(f_info: S3FileInfo, s3_client: S3Client):
+    """Update file information with the offset and size of the file inside the zip archive"""
+
+    splitted_path = f_info.key.split(".zip!")
+    f_info.key = f"{splitted_path[0]}.zip"
+    f_info.zip_filepath = splitted_path[-1]  # file path inside the ZIP archive
+
+    f_info.data_start_offset, f_info.size = file_position_from_s3_zip(
+        f_info.bucket_name,
+        f_info.key,
+        s3_client,
+        f_info.zip_filepath,
+    )

-def open_s3_zipped_object(
-    bucket_name: str, key_name: str, client_s3: S3Client, partial: bool = True
-) -> ZipFile:
-    """
-    Open s3 zipped object, without downloading it.
-
-    See https://stackoverflow.com/questions/41789176/how-to-count-files-inside-zip-in-aws-s3-without-downloading-it;
-    Based on https://stackoverflow.com/questions/51351000/read-zip-files-from-s3-without-downloading-the-entire-file

-
-    :
-    :
-    :
-
+def _compute_file_ranges(
+    file_info: S3FileInfo,
+    byte_range: tuple[Optional[int], Optional[int]],
+    range_size: int,
+) -> Optional[list[tuple[int, int]]]:
     """
-
-
-
-
-
-
-
-
-
-
-
-
+    Compute the byte ranges to download for a single file, considering the overall requested range.
+
+    This function calculates which byte ranges within the given file should be downloaded,
+    based on the global requested byte range (`byte_range`) and the size of each chunk (`range_size`).
+    It accounts for possible offsets if the file is part of a ZIP archive or not aligned at offset zero.
+
+    :param file_info: The S3FileInfo object containing file metadata, including size, data offset,
+        and its starting offset in the full logical file stream.
+    :param byte_range: A tuple (start, end) specifying the requested global byte range, where either
+        value may be None to indicate an open-ended range.
+    :param range_size: The size of each download chunk in bytes.
+    :returns: A list of (start, end) tuples indicating byte ranges to download within this file,
+        or None if the file lies completely outside the requested range.
+    """
+    file_size = file_info.size
+    file_start_offset = file_info.file_start_offset
+    file_end_offset = file_start_offset + file_size - 1
+
+    range_start, range_end = byte_range
+
+    # Check if the file overlaps with the requested range
+    if range_end is not None and range_start is not None:
+        if file_end_offset < range_start or file_start_offset > range_end:
+            return None  # No overlap, skip this file
+
+    start = 0
+    end = file_size - 1
+
+    # Adjust start and end based on the requested range
+    if range_start is not None:
+        start = max(0, range_start - file_start_offset)
+    if range_end is not None:
+        end = min(file_size - 1, range_end - file_start_offset)
+
+    start += file_info.data_start_offset
+    end += file_info.data_start_offset
+
+    # Compute the ranges in chunks
+    ranges = []
+    for chunk_start in range(start, end + 1, range_size):
+        chunk_end = min(chunk_start + range_size - 1, end)
+        ranges.append((chunk_start, chunk_end))
+
+    return ranges
+
+
+def _chunks_from_s3_objects(
+    s3_client: S3Client,
+    files_info: list[S3FileInfo],
+    byte_range: tuple[Optional[int], Optional[int]],
+    range_size: int,
+    executor: ThreadPoolExecutor,
+) -> Iterator[tuple[int, Iterator[bytes]]]:
+    """Download chunks from S3 objects in parallel, respecting byte ranges and file order."""
+    for f_info in files_info:
+        ranges = _compute_file_ranges(f_info, byte_range, range_size)
+
+        if not ranges:
+            logger.debug("Skipping %s: no ranges to fetch", f_info.key)
+            continue
+
+        f_info.buffers = {}
+        f_info.next_yield = 0
+
+        futures = {}
+        for start, length in ranges:
+            future = executor.submit(
+                fetch_range,
+                f_info.bucket_name,
+                f_info.key,
+                start,
+                length,
+                s3_client,
+            )
+            futures[future] = start
+
+        f_info.futures = futures
+
+    # Combine all futures to wait on globally
+    all_futures = {
+        fut: (f_info, start)
+        for f_info in files_info
+        for fut, start in f_info.futures.items()
+    }
+
+    current_file_index = 0
+
+    # Yield chunks per file (one at a time)
+    while current_file_index < len(files_info):
+        current_info = files_info[current_file_index]
+
+        def chunks_generator() -> Iterator[bytes]:
+            """yield chunks of data for the current file."""
+            nonlocal current_file_index, all_futures
+            while current_info.next_yield < current_info.size:
+                # Wait for any futures to complete
+                done, _ = wait(all_futures.keys(), return_when=FIRST_COMPLETED)
+
+                for fut in done:
+                    f_info, start = all_futures.pop(fut)
+                    data = fut.result()
+                    f_info.buffers[start] = data
+
+                # Yield chunks as they are available
+                next_start = current_info.next_yield
+                while next_start in current_info.buffers:
+                    chunk = current_info.buffers.pop(next_start)
+                    if not isinstance(chunk, bytes):
+                        raise InvalidDataError(
+                            f"Expected bytes, got {type(chunk).__name__} in stream chunks: {chunk}"
+                        )
+                    yield chunk
+
+                    next_start += range_size
+                    current_info.next_yield = next_start
+
+                # If done with this file, stop yielding chunks for this file
+                if current_info.next_yield >= current_info.size:
+                    break
+
+        yield current_file_index, chunks_generator()
+
+        current_file_index += 1
+
+
+def _build_stream_response(
+    zip_filename: str,
+    files_info: list[S3FileInfo],
+    files_iterator: Iterator[tuple[int, Iterator[bytes]]],
+    compress: Literal["zip", "raw", "auto"],
+    executor: ThreadPoolExecutor,
+) -> StreamResponse:
+    """
+    Build a streaming HTTP response for one or multiple files from S3, supporting ZIP, raw, and multipart formats.

-
-        cd + eocd if partial else fetch(bucket_name, key_name, 0, size, client_s3)
-    )
+    The response format depends on the `compress` parameter and the number of files:

-    zip
+    - If `compress` is "zip" or "auto" with multiple files, returns a ZIP archive containing all files.
+    - If `compress` is "raw" and multiple files, returns a multipart/mixed response with each file as a part.
+    - If only one file is present and `compress` is "raw" or "auto", streams the file directly with its MIME type.

-
+    Response formats:

+    - ZIP archive (Content-Type: application/zip) with Content-Disposition for download.
+    - Multipart/mixed (Content-Type: multipart/mixed; boundary=...) with each file as a part.
+    - Single raw file stream with its MIME type and Content-Disposition for download.

-
-
-    )
+    :param zip_filename: Base filename to use for the ZIP archive (without extension).
+    :param files_info: List of S3FileInfo objects describing each file (metadata, MIME type, etc.).
+    :param files_iterator: Iterator yielding (file_index, chunk_iterator) for streaming file contents.
+    :param compress: Output format:
+        - "zip": Always produce a ZIP archive.
+        - "raw": Stream files directly, as a single file or multipart.
+        - "auto": ZIP if multiple files, raw if single file.
+    :param executor: Executor used for concurrent streaming and cleanup.
+    :return: Streaming HTTP response with appropriate content, headers, and media type.
     """
-
-
-
-
-
-
-
-
-
+    headers = {
+        "Accept-Ranges": "bytes",
+    }
+
+    def _wrap_generator_with_cleanup(
+        generator: Iterable[bytes], executor: ThreadPoolExecutor
+    ) -> Iterator[bytes]:
+        try:
+            yield from generator
+        finally:
+            executor.shutdown(wait=True)
+
+    def _build_response(
+        content_gen: Iterable[bytes],
+        media_type: str,
+        extra_headers: dict[str, str] = {},
+    ) -> StreamResponse:
+        return StreamResponse(
+            content=_wrap_generator_with_cleanup(content_gen, executor),
+            media_type=media_type,
+            headers={**headers, **extra_headers},
+        )
+
+    zip_response = (len(files_info) > 1 and compress == "auto") or compress == "zip"
+
+    if zip_response:
+        modified_at = datetime.now()
+        perms = 0o600
+        total_file_size = sum(f.size for f in files_info)
+
+        def zip_stream() -> Iterator[
+            tuple[str, datetime, int, Method, Iterable[bytes]]
+        ]:
+            for index, chunks_generator in files_iterator:
+                yield (
+                    files_info[index].rel_path or files_info[index].key,
+                    modified_at,
+                    perms,
+                    ZIP_AUTO(total_file_size, level=0),
+                    chunks_generator,
+                )
+
+        return _build_response(
+            content_gen=stream_zip(zip_stream()),
+            media_type="application/zip",
+            extra_headers={
+                "content-disposition": f'attachment; filename="{zip_filename}.zip"'
+            },
+        )
+
+    elif len(files_info) > 1:
+        boundary = uuid.uuid4().hex
+
+        def multipart_stream():
+            current_index = -1
+            for index, chunks_generator in files_iterator:
+                if index != current_index:
+                    if current_index != -1:
+                        yield b"\r\n"
+                    filename = os.path.basename(files_info[index].key)
+                    yield (
+                        f"--{boundary}\r\n"
+                        f'Content-Disposition: attachment; filename="{filename}"\r\n'
+                        f"Content-Type: {files_info[index].data_type}\r\n\r\n"
+                    ).encode()
+                    current_index = index
+                yield from chunks_generator
+            yield f"\r\n--{boundary}--\r\n".encode()
+
+        return _build_response(
+            content_gen=multipart_stream(),
+            media_type=f"multipart/mixed; boundary={boundary}",
+        )
+
+    else:
+        index, chunks_generator = next(files_iterator)
+        first_chunk = next(chunks_generator)
+        filename = os.path.basename(files_info[index].key)
+
+        def single_file_stream() -> Iterator[bytes]:
+            yield first_chunk
+            yield from chunks_generator
+
+        return _build_response(
+            content_gen=single_file_stream(),
+            media_type=files_info[index].data_type,
+            extra_headers={"content-disposition": f'attachment; filename="{filename}"'},
+        )
+
+
+def stream_download_from_s3(
+    s3_client: S3Client,
+    files_info: list[S3FileInfo],
+    byte_range: tuple[Optional[int], Optional[int]] = (None, None),
+    compress: Literal["zip", "raw", "auto"] = "auto",
+    zip_filename: str = "archive",
+    range_size: int = 1024**2 * 8,
+    max_workers: int = 8,
+) -> StreamResponse:
     """
-
-
-
+    Stream data from one or more S3 objects in chunks, with support for global byte ranges
+    and partial file extraction from ZIP archives.
+
+    This function downloads product data from S3 using concurrent range requests across one or
+    multiple files. It divides the requested data into chunks (default: 8 MiB) and issues
+    parallel HTTP range requests to optimize download throughput. This is particularly useful
+    for large files or datasets stored across multiple S3 objects.
+
+    If the S3 key refers to a path inside a ``.zip`` file (denoted by ``.zip!<internal_path>``),
+    the function extracts the specified file from the archive only if it is stored uncompressed
+    (ZIP method = STORE). Compressed formats (like DEFLATE) are not supported for partial ZIP extraction.
+
+    The function supports global byte range filtering via the ``byte_range`` parameter, which allows
+    requesting only a specific portion of the logical file stream across all provided objects.
+
+    Downloads are performed concurrently using a thread pool and HTTP range requests. Each chunk is downloaded
+    as a separate HTTP request and yielded in file order.
+
+    The ``compress`` parameter determines the output format:
+
+    - ``zip``: Always produce a ZIP archive containing all files.
+    - ``raw``: Stream files directly without wrapping, either as a single file or multipart response.
+    - ``auto``: Automatically select the format:
+        - raw stream if only a single file is requested
+        - ZIP archive if multiple files are requested
+
+    :param s3_client: A configured S3 client capable of making range requests.
+    :param files_info: List of S3FileInfo objects representing the files to download.
+    :param byte_range: Tuple (start, end) defining the inclusive global byte range to download across all objects.
+        Either value can be None to indicate open-ended range.
+    :param compress: Determines the output format of the streamed response.
+    :param zip_filename: The base filename to use when producing a ZIP archive (without extension).
+    :param range_size: The size in bytes of each download chunk. Defaults to 8 MiB.
+    :param max_workers: The maximum number of concurrent download tasks. Controls the size of the thread pool.
+    :return: Streaming HTTP response with content according to the requested format.
+    :raises DownloadError: If any error occurs during streaming from S3, including missing files or
+        unsupported ZIP compression.
+    """
+    offset = 0
+
+    executor = ThreadPoolExecutor(max_workers=max_workers)
+    try:
+        for f_info in files_info:
+            # Check if file is inside a ZIP
+            if ".zip!" in f_info.key:
+                future = executor.submit(_prepare_file_in_zip, f_info, s3_client)
+                f_info.futures[future] = 0
+
+        for f_info in files_info:
+            for future in f_info.futures:
+                future.result()
+            f_info.file_start_offset = offset
+            offset += f_info.size
+
+            if not f_info.data_type or f_info.data_type == MIME_OCTET_STREAM:
+                guessed = guess_file_type(f_info.key)
+                f_info.data_type = guessed or MIME_OCTET_STREAM
+
+        chunks_tuple = _chunks_from_s3_objects(
+            s3_client,
+            files_info,
+            byte_range,
+            range_size,
+            executor,
+        )
+
+        return _build_stream_response(
+            zip_filename, files_info, chunks_tuple, compress, executor
+        )
+    except Exception as e:
+        executor.shutdown(wait=True)
+        raise DownloadError(str(e)) from e


 def update_assets_from_s3(
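Note: the hunk above replaces the previous inline range/ZIP helpers with a streaming download API built around S3FileInfo and stream_download_from_s3. A minimal usage sketch, based only on the signatures visible in this diff; the client, bucket, keys, and sizes are hypothetical placeholders, and it assumes StreamResponse exposes the byte generator as its content attribute, as in the constructor calls above:

    import boto3

    from eodag.utils.s3 import S3FileInfo, stream_download_from_s3

    s3_client = boto3.client("s3")  # hypothetical client configuration

    # Sizes of plain objects are assumed known up front (e.g. from a prior listing).
    # Keys of the form "path/archive.zip!inner/file" would instead select a single
    # STORE'd entry inside a ZIP, with offset and size resolved by _prepare_file_in_zip.
    files = [
        S3FileInfo(size=1_048_576, key="products/B01.tif", bucket_name="my-bucket"),
        S3FileInfo(size=2_097_152, key="products/B02.tif", bucket_name="my-bucket"),
    ]

    # With compress="auto" and more than one file, the response is a streamed ZIP archive
    response = stream_download_from_s3(s3_client, files, compress="auto", zip_filename="product")
    with open("product.zip", "wb") as f:
        for chunk in response.content:
            f.write(chunk)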
@@ -229,3 +569,203 @@ def update_assets_from_s3(
         raise NotAvailableError(
             f"assets for product {prefix} could not be found"
         ) from e
+
+
+# ----- ZIP section -----
+
+
+def open_s3_zipped_object(
+    bucket_name: str,
+    key_name: str,
+    s3_client,
+    zip_size: Optional[int] = None,
+    partial: bool = True,
+) -> tuple[ZipFile, bytes]:
+    """
+    Fetches the central directory and EOCD (End Of Central Directory) from an S3 object and opens a ZipFile in memory.
+
+    This function retrieves the ZIP file's central directory and EOCD by performing range requests on the S3 object.
+    It supports partial fetching (only the central directory and EOCD) for efficiency, or full ZIP download if needed.
+
+    :param bucket_name: Name of the S3 bucket containing the ZIP file.
+    :param key_name: Key (path) of the ZIP file in the S3 bucket.
+    :param s3_client: S3 client instance used to perform range requests.
+    :param zip_size: Size of the ZIP file in bytes. If None, it will be determined via a HEAD request.
+    :param partial: If True, only fetch the central directory and EOCD. If False, fetch the entire ZIP file.
+    :return: Tuple containing the opened ZipFile object and the central directory bytes.
+    :raises InvalidDataError: If the EOCD signature is not found in the last 64KB of the file.
+    """
+    # EOCD is at least 22 bytes, but can be longer if ZIP comment exists.
+    # For simplicity, we fetch last 64KB max (max EOCD + comment length allowed by ZIP spec)
+    if zip_size is None:
+        response = s3_client.head_object(Bucket=bucket_name, Key=key_name)
+        zip_size = int(response["ContentLength"])
+
+    fetch_size = min(65536, zip_size)
+    eocd_search = fetch_range(
+        bucket_name, key_name, zip_size - fetch_size, zip_size - 1, s3_client
+    )
+
+    # Find EOCD signature from end: 0x06054b50 (little endian)
+    eocd_signature = b"\x50\x4b\x05\x06"
+    eocd_offset = eocd_search.rfind(eocd_signature)
+    if eocd_offset == -1:
+        raise InvalidDataError("EOCD signature not found in last 64KB of the file.")
+
+    eocd = eocd_search[eocd_offset : eocd_offset + 22]
+
+    cd_size = parse_le_uint32(eocd[12:16])
+    cd_start = parse_le_uint32(eocd[16:20])
+
+    # Fetch central directory
+    cd_data = fetch_range(
+        bucket_name, key_name, cd_start, cd_start + cd_size - 1, s3_client
+    )
+
+    zip_data = (
+        cd_data + eocd
+        if partial
+        else fetch_range(bucket_name, key_name, 0, zip_size - 1, s3_client)
+    )
+    zipf = ZipFile(io.BytesIO(zip_data))
+    return zipf, cd_data
+
+
+def _parse_central_directory_entry(cd_data: bytes, offset: int) -> dict[str, int]:
+    """
+    Parse one central directory file header entry starting at offset.
+    Returns dict with relative local header offset and sizes.
+    """
+    signature = cd_data[offset : offset + 4]
+    if signature != b"PK\x01\x02":
+        raise InvalidDataError("Bad central directory file header signature")
+
+    filename_len = parse_le_uint16(cd_data[offset + 28 : offset + 30])
+    extra_len = parse_le_uint16(cd_data[offset + 30 : offset + 32])
+    comment_len = parse_le_uint16(cd_data[offset + 32 : offset + 34])
+
+    relative_offset = parse_le_uint32(cd_data[offset + 42 : offset + 46])
+
+    header_size = 46 + filename_len + extra_len + comment_len
+
+    return {
+        "relative_offset": relative_offset,
+        "header_size": header_size,
+        "filename_len": filename_len,
+        "extra_len": extra_len,
+        "comment_len": comment_len,
+        "total_size": header_size,
+    }
+
+
+def _parse_local_file_header(local_header_bytes: bytes) -> int:
+    """
+    Parse local file header to find total header size:
+    fixed 30 bytes + filename length + extra field length
+    """
+    if local_header_bytes[0:4] != b"PK\x03\x04":
+        raise InvalidDataError("Bad local file header signature")
+
+    filename_len = parse_le_uint16(local_header_bytes[26:28])
+    extra_len = parse_le_uint16(local_header_bytes[28:30])
+    total_size = 30 + filename_len + extra_len
+    return total_size
+
+
+def file_position_from_s3_zip(
+    s3_bucket: str,
+    object_key: str,
+    s3_client,
+    target_filepath: str,
+) -> tuple[int, int]:
+    """
+    Get the start position and size of a specific file inside a ZIP archive stored in S3.
+    This function assumes the file is uncompressed (ZIP_STORED).
+
+    The returned tuple contains:
+
+    - **file_data_start**: The byte offset where the file data starts in the ZIP archive.
+    - **file_size**: The size of the file in bytes.
+
+    :param s3_bucket: The S3 bucket name.
+    :param object_key: The S3 object key for the ZIP file.
+    :param s3_client: The Boto3 S3 client.
+    :param target_filepath: The file path inside the ZIP archive to locate.
+    :return: A tuple (file_data_start, file_size)
+    :raises FileNotFoundError: If the target file is not found in the ZIP archive.
+    :raises NotImplementedError: If the file is not uncompressed (ZIP_STORED)
+    """
+    zipf, cd_data = open_s3_zipped_object(s3_bucket, object_key, s3_client)
+
+    # Find file in zipf.filelist to get its index and f_info
+    target_info = None
+    cd_offset = 0
+    for fi in zipf.filelist:
+        if fi.filename == target_filepath:
+            target_info = fi
+            break
+        # 46 is the fixed size (in bytes) of the Central Directory File Header according to the ZIP spec
+        cd_entry_len = (
+            46 + len(fi.filename.encode("utf-8")) + len(fi.extra) + len(fi.comment)
+        )
+        cd_offset += cd_entry_len
+
+    zipf.close()
+
+    if target_info is None:
+        raise FileNotFoundError(f"File {target_filepath} not found in ZIP archive")
+
+    if target_info.compress_type != ZIP_STORED:
+        raise NotImplementedError("Only uncompressed files (ZIP_STORED) are supported.")
+
+    # Parse central directory entry to get relative local header offset
+    cd_entry = cd_data[
+        cd_offset : cd_offset
+        + (
+            46
+            + len(target_info.filename.encode("utf-8"))
+            + len(target_info.extra)
+            + len(target_info.comment)
+        )
+    ]
+    cd_entry_info = _parse_central_directory_entry(cd_entry, 0)
+
+    local_header_offset = cd_entry_info["relative_offset"]
+
+    # Fetch local file header from S3 (at least 30 bytes + filename + extra field)
+    # We'll fetch 4 KB max to cover large filenames/extra fields safely
+    local_header_fetch_size = 4096
+    local_header_bytes = fetch_range(
+        s3_bucket,
+        object_key,
+        local_header_offset,
+        local_header_offset + local_header_fetch_size - 1,
+        s3_client,
+    )
+
+    local_header_size = _parse_local_file_header(local_header_bytes)
+
+    # Calculate file data start and end offsets
+    file_data_start = local_header_offset + local_header_size
+
+    return file_data_start, target_info.file_size
+
+
+def list_files_in_s3_zipped_object(
+    bucket_name: str, key_name: str, s3_client: S3Client
+) -> list[ZipInfo]:
+    """
+    List files in s3 zipped object, without downloading it.
+
+    See https://stackoverflow.com/questions/41789176/how-to-count-files-inside-zip-in-aws-s3-without-downloading-it;
+    Based on https://stackoverflow.com/questions/51351000/read-zip-files-from-s3-without-downloading-the-entire-file
+
+    :param bucket_name: Bucket name of the object to fetch
+    :param key_name: Key name of the object to fetch
+    :param s3_resource: s3 resource used to fetch the object
+    :returns: List of files in zip
+    """
+    zip_file, _ = open_s3_zipped_object(bucket_name, key_name, s3_client)
+    with zip_file:
+        logger.debug("Found %s files in %s" % (len(zip_file.filelist), key_name))
+        return zip_file.filelist