eodag 3.5.1__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
eodag/utils/s3.py CHANGED
@@ -20,118 +20,458 @@ from __future__ import annotations
  import io
  import logging
  import os
- import zipfile
- from typing import TYPE_CHECKING, List, Optional
+ import uuid
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from typing import TYPE_CHECKING
  from urllib.parse import urlparse
+ from zipfile import ZIP_STORED, ZipFile

  import boto3
  import botocore
+ import botocore.exceptions
+ from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
+ from stream_zip import ZIP_AUTO, stream_zip

  from eodag.plugins.authentication.aws_auth import AwsAuth
- from eodag.utils import get_bucket_name_and_prefix, guess_file_type
+ from eodag.utils import (
+     StreamResponse,
+     get_bucket_name_and_prefix,
+     guess_file_type,
+     parse_le_uint16,
+     parse_le_uint32,
+ )
  from eodag.utils.exceptions import (
      AuthenticationError,
+     DownloadError,
+     InvalidDataError,
      MisconfiguredError,
      NotAvailableError,
  )

  if TYPE_CHECKING:
-     from zipfile import ZipFile, ZipInfo
+     from typing import Iterable, Iterator, Literal, Optional
+     from zipfile import ZipInfo

      from mypy_boto3_s3.client import S3Client
+     from stream_zip import Method

      from eodag.api.product import EOProduct  # type: ignore

  logger = logging.getLogger("eodag.utils.s3")

+ MIME_OCTET_STREAM = "application/octet-stream"
+

- def fetch(
-     bucket_name: str, key_name: str, start: int, len: int, client_s3: S3Client
+ def fetch_range(
+     bucket_name: str, key_name: str, start: int, end: int, client_s3: S3Client
  ) -> bytes:
      """
      Range-fetches a S3 key.

      :param bucket_name: Bucket name of the object to fetch
      :param key_name: Key name of the object to fetch
-     :param start: Bucket name to fetch
-     :param len: Bucket name to fetch
+     :param start: Start byte position to fetch
+     :param end: End byte position to fetch
      :param client_s3: s3 client used to fetch the object
      :returns: Object bytes
      """
-     end = start + len - 1
-     s3_object = client_s3.get_object(
+     response = client_s3.get_object(
          Bucket=bucket_name, Key=key_name, Range="bytes=%d-%d" % (start, end)
      )
-     return s3_object["Body"].read()
+     return response["Body"].read()

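
For illustration, a minimal sketch of calling the renamed helper with a plain boto3 client; the bucket, key and offsets below are placeholders, not values from this package:

import boto3

from eodag.utils.s3 import fetch_range

s3 = boto3.client("s3")  # assumes S3 credentials are already configured

# Fetch the first 1 KiB of a hypothetical object; offsets are inclusive and map
# directly to the "bytes=start-end" HTTP Range header built by fetch_range.
first_kib = fetch_range("my-bucket", "path/to/object.tif", 0, 1023, s3)
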
 
- def parse_int(bytes: bytes) -> int:
+ @dataclass
+ class S3FileInfo:
      """
-     Parses 2 or 4 little-endian bits into their corresponding integer value.
-
-     :param bytes: bytes to parse
-     :returns: parsed int
+     Describe an S3 object with basic file info and its download state.
      """
-     val = (bytes[0]) + ((bytes[1]) << 8)
-     if len(bytes) > 3:
-         val += ((bytes[2]) << 16) + ((bytes[3]) << 24)
-     return val

+     size: int
+     key: str
+     bucket_name: str
+     #: Path inside the ZIP archive if the file is stored inside a ZIP.
+     zip_filepath: Optional[str] = None
+     #: Offset in the ZIP archive where the file data starts.
+     data_start_offset: int = 0
+     #: MIME type of the file, defaulting to application/octet-stream.
+     #: It can be updated based on the file extension or content type.
+     data_type: str = MIME_OCTET_STREAM
+     #: Relative path of the file, if applicable (e.g., inside a ZIP archive).
+     rel_path: Optional[str] = None
+
+     # These fields hold the state for downloading
+     #: Offset in the logical (global) file stream where this file starts.
+     file_start_offset: int = 0
+     #: Mapping of futures to their start byte offsets, used to track download progress.
+     #: Each future corresponds to a chunk of data being downloaded.
+     #: The key is the future object, and the value is the start byte offset of that
+     #: chunk in the logical file stream.
+     futures: dict = field(default_factory=dict)
+     #: Buffers for downloaded data chunks, mapping start byte offsets to the actual data.
+     #: This allows for partial downloads and efficient memory usage.
+     #: The key is the start byte offset, and the value is the bytes data for that
+     #: offset. This is used to yield data in the correct order during streaming.
+     #: It is updated as chunks are downloaded.
+     buffers: dict[int, bytes] = field(default_factory=dict)
+     #: The next offset to yield in the file, used to track progress during downloading
+     #: and yielding chunks. It starts at 0 and is updated as data is yielded.
+     #: This allows the streaming process to continue from where it left off,
+     #: ensuring that all data is eventually yielded without duplication.
+     next_yield: int = 0
+
+
+ def _prepare_file_in_zip(f_info: S3FileInfo, s3_client: S3Client):
+     """Update file information with the offset and size of the file inside the zip archive"""
+
+     splitted_path = f_info.key.split(".zip!")
+     f_info.key = f"{splitted_path[0]}.zip"
+     f_info.zip_filepath = splitted_path[-1]  # file path inside the ZIP archive
+
+     f_info.data_start_offset, f_info.size = file_position_from_s3_zip(
+         f_info.bucket_name,
+         f_info.key,
+         s3_client,
+         f_info.zip_filepath,
+     )

- def open_s3_zipped_object(
-     bucket_name: str, key_name: str, client_s3: S3Client, partial: bool = True
- ) -> ZipFile:
-     """
-     Open s3 zipped object, without downloading it.
-
-     See https://stackoverflow.com/questions/41789176/how-to-count-files-inside-zip-in-aws-s3-without-downloading-it;
-     Based on https://stackoverflow.com/questions/51351000/read-zip-files-from-s3-without-downloading-the-entire-file

-     :param bucket_name: Bucket name of the object to fetch
-     :param key_name: Key name of the object to fetch
-     :param client_s3: s3 client used to fetch the object
-     :param partial: fetch partial data if only content info is needed
-     :returns: List of files in zip
+ def _compute_file_ranges(
+     file_info: S3FileInfo,
+     byte_range: tuple[Optional[int], Optional[int]],
+     range_size: int,
+ ) -> Optional[list[tuple[int, int]]]:
      """
-     response = client_s3.head_object(Bucket=bucket_name, Key=key_name)
-     size = response["ContentLength"]
-
-     # End Of Central Directory bytes
-     eocd = fetch(bucket_name, key_name, size - 22, 22, client_s3)
-
-     # start offset and size of the central directory
-     cd_start = parse_int(eocd[16:20])
-     cd_size = parse_int(eocd[12:16])
-
-     # fetch central directory, append EOCD, and open as zipfile
-     cd = fetch(bucket_name, key_name, cd_start, cd_size, client_s3)
+     Compute the byte ranges to download for a single file, considering the overall requested range.
+
+     This function calculates which byte ranges within the given file should be downloaded,
+     based on the global requested byte range (`byte_range`) and the size of each chunk (`range_size`).
+     It accounts for possible offsets if the file is part of a ZIP archive or not aligned at offset zero.
+
+     :param file_info: The S3FileInfo object containing file metadata, including size, data offset,
+         and its starting offset in the full logical file stream.
+     :param byte_range: A tuple (start, end) specifying the requested global byte range, where either
+         value may be None to indicate an open-ended range.
+     :param range_size: The size of each download chunk in bytes.
+     :returns: A list of (start, end) tuples indicating byte ranges to download within this file,
+         or None if the file lies completely outside the requested range.
+     """
+     file_size = file_info.size
+     file_start_offset = file_info.file_start_offset
+     file_end_offset = file_start_offset + file_size - 1
+
+     range_start, range_end = byte_range
+
+     # Check if the file overlaps with the requested range
+     if range_end is not None and range_start is not None:
+         if file_end_offset < range_start or file_start_offset > range_end:
+             return None  # No overlap, skip this file
+
+     start = 0
+     end = file_size - 1
+
+     # Adjust start and end based on the requested range
+     if range_start is not None:
+         start = max(0, range_start - file_start_offset)
+     if range_end is not None:
+         end = min(file_size - 1, range_end - file_start_offset)
+
+     start += file_info.data_start_offset
+     end += file_info.data_start_offset
+
+     # Compute the ranges in chunks
+     ranges = []
+     for chunk_start in range(start, end + 1, range_size):
+         chunk_end = min(chunk_start + range_size - 1, end)
+         ranges.append((chunk_start, chunk_end))
+
+     return ranges
+
+
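
To make the chunking above concrete, a small worked example with the internal helper as defined in this hunk; the file size, offsets and bucket name are invented for illustration:

from eodag.utils.s3 import S3FileInfo, _compute_file_ranges

# A 20 MiB object that starts at global offset 0 in the logical stream.
info = S3FileInfo(size=20 * 1024**2, key="data.bin", bucket_name="my-bucket")

# No global byte-range restriction, 8 MiB chunks.
_compute_file_ranges(info, (None, None), 8 * 1024**2)
# -> [(0, 8388607), (8388608, 16777215), (16777216, 20971519)]

# Restricting the global range to bytes 10 MiB .. 12 MiB - 1 narrows it to one chunk.
_compute_file_ranges(info, (10 * 1024**2, 12 * 1024**2 - 1), 8 * 1024**2)
# -> [(10485760, 12582911)]
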
+ def _chunks_from_s3_objects(
+     s3_client: S3Client,
+     files_info: list[S3FileInfo],
+     byte_range: tuple[Optional[int], Optional[int]],
+     range_size: int,
+     executor: ThreadPoolExecutor,
+ ) -> Iterator[tuple[int, Iterator[bytes]]]:
+     """Download chunks from S3 objects in parallel, respecting byte ranges and file order."""
+     for f_info in files_info:
+         ranges = _compute_file_ranges(f_info, byte_range, range_size)
+
+         if not ranges:
+             logger.debug("Skipping %s: no ranges to fetch", f_info.key)
+             continue
+
+         f_info.buffers = {}
+         f_info.next_yield = 0
+
+         futures = {}
+         for start, length in ranges:
+             future = executor.submit(
+                 fetch_range,
+                 f_info.bucket_name,
+                 f_info.key,
+                 start,
+                 length,
+                 s3_client,
+             )
+             futures[future] = start
+
+         f_info.futures = futures
+
+     # Combine all futures to wait on globally
+     all_futures = {
+         fut: (f_info, start)
+         for f_info in files_info
+         for fut, start in f_info.futures.items()
+     }
+
+     current_file_index = 0
+
+     # Yield chunks per file (one at a time)
+     while current_file_index < len(files_info):
+         current_info = files_info[current_file_index]
+
+         def chunks_generator() -> Iterator[bytes]:
+             """yield chunks of data for the current file."""
+             nonlocal current_file_index, all_futures
+             while current_info.next_yield < current_info.size:
+                 # Wait for any futures to complete
+                 done, _ = wait(all_futures.keys(), return_when=FIRST_COMPLETED)
+
+                 for fut in done:
+                     f_info, start = all_futures.pop(fut)
+                     data = fut.result()
+                     f_info.buffers[start] = data
+
+                 # Yield chunks as they are available
+                 next_start = current_info.next_yield
+                 while next_start in current_info.buffers:
+                     chunk = current_info.buffers.pop(next_start)
+                     if not isinstance(chunk, bytes):
+                         raise InvalidDataError(
+                             f"Expected bytes, got {type(chunk).__name__} in stream chunks: {chunk}"
+                         )
+                     yield chunk
+
+                     next_start += range_size
+                     current_info.next_yield = next_start
+
+                 # If done with this file, stop yielding chunks for this file
+                 if current_info.next_yield >= current_info.size:
+                     break
+
+         yield current_file_index, chunks_generator()
+
+         current_file_index += 1
+
+
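
The iterator above yields one (file_index, chunk_generator) pair per file, in file order. A minimal sketch of consuming it directly; the bucket, key, sizes and executor setup are placeholders, and in the package this iterator is normally driven by stream_download_from_s3 further below:

from concurrent.futures import ThreadPoolExecutor

import boto3

from eodag.utils.s3 import S3FileInfo, _chunks_from_s3_objects

s3 = boto3.client("s3")
files = [S3FileInfo(size=1024, key="a.txt", bucket_name="my-bucket")]

with ThreadPoolExecutor(max_workers=4) as executor:
    for index, chunks in _chunks_from_s3_objects(s3, files, (None, None), 512, executor):
        with open(f"file_{index}.part", "wb") as out:
            for chunk in chunks:
                out.write(chunk)  # chunks arrive in byte order for each file
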
+ def _build_stream_response(
+     zip_filename: str,
+     files_info: list[S3FileInfo],
+     files_iterator: Iterator[tuple[int, Iterator[bytes]]],
+     compress: Literal["zip", "raw", "auto"],
+     executor: ThreadPoolExecutor,
+ ) -> StreamResponse:
+     """
+     Build a streaming HTTP response for one or multiple files from S3, supporting ZIP, raw, and multipart formats.

-     zip_data = (
-         cd + eocd if partial else fetch(bucket_name, key_name, 0, size, client_s3)
-     )
+     The response format depends on the `compress` parameter and the number of files:

-     zip = zipfile.ZipFile(io.BytesIO(zip_data))
+     - If `compress` is "zip" or "auto" with multiple files, returns a ZIP archive containing all files.
+     - If `compress` is "raw" and multiple files, returns a multipart/mixed response with each file as a part.
+     - If only one file is present and `compress` is "raw" or "auto", streams the file directly with its MIME type.

-     return zip
+     Response formats:

+     - ZIP archive (Content-Type: application/zip) with Content-Disposition for download.
+     - Multipart/mixed (Content-Type: multipart/mixed; boundary=...) with each file as a part.
+     - Single raw file stream with its MIME type and Content-Disposition for download.

- def list_files_in_s3_zipped_object(
-     bucket_name: str, key_name: str, client_s3: S3Client
- ) -> List[ZipInfo]:
+     :param zip_filename: Base filename to use for the ZIP archive (without extension).
+     :param files_info: List of S3FileInfo objects describing each file (metadata, MIME type, etc.).
+     :param files_iterator: Iterator yielding (file_index, chunk_iterator) for streaming file contents.
+     :param compress: Output format:
+         - "zip": Always produce a ZIP archive.
+         - "raw": Stream files directly, as a single file or multipart.
+         - "auto": ZIP if multiple files, raw if single file.
+     :param executor: Executor used for concurrent streaming and cleanup.
+     :return: Streaming HTTP response with appropriate content, headers, and media type.
      """
-     List files in s3 zipped object, without downloading it.
-
-     See https://stackoverflow.com/questions/41789176/how-to-count-files-inside-zip-in-aws-s3-without-downloading-it;
-     Based on https://stackoverflow.com/questions/51351000/read-zip-files-from-s3-without-downloading-the-entire-file
-
-     :param bucket_name: Bucket name of the object to fetch
-     :param key_name: Key name of the object to fetch
-     :param client_s3: s3 client used to fetch the object
-     :returns: List of files in zip
+     headers = {
+         "Accept-Ranges": "bytes",
+     }
+
+     def _wrap_generator_with_cleanup(
+         generator: Iterable[bytes], executor: ThreadPoolExecutor
+     ) -> Iterator[bytes]:
+         try:
+             yield from generator
+         finally:
+             executor.shutdown(wait=True)
+
+     def _build_response(
+         content_gen: Iterable[bytes],
+         media_type: str,
+         extra_headers: dict[str, str] = {},
+     ) -> StreamResponse:
+         return StreamResponse(
+             content=_wrap_generator_with_cleanup(content_gen, executor),
+             media_type=media_type,
+             headers={**headers, **extra_headers},
+         )
+
+     zip_response = (len(files_info) > 1 and compress == "auto") or compress == "zip"
+
+     if zip_response:
+         modified_at = datetime.now()
+         perms = 0o600
+         total_file_size = sum(f.size for f in files_info)
+
+         def zip_stream() -> Iterator[
+             tuple[str, datetime, int, Method, Iterable[bytes]]
+         ]:
+             for index, chunks_generator in files_iterator:
+                 yield (
+                     files_info[index].rel_path or files_info[index].key,
+                     modified_at,
+                     perms,
+                     ZIP_AUTO(total_file_size, level=0),
+                     chunks_generator,
+                 )
+
+         return _build_response(
+             content_gen=stream_zip(zip_stream()),
+             media_type="application/zip",
+             extra_headers={
+                 "content-disposition": f'attachment; filename="{zip_filename}.zip"'
+             },
+         )
+
+     elif len(files_info) > 1:
+         boundary = uuid.uuid4().hex
+
+         def multipart_stream():
+             current_index = -1
+             for index, chunks_generator in files_iterator:
+                 if index != current_index:
+                     if current_index != -1:
+                         yield b"\r\n"
+                     filename = os.path.basename(files_info[index].key)
+                     yield (
+                         f"--{boundary}\r\n"
+                         f'Content-Disposition: attachment; filename="{filename}"\r\n'
+                         f"Content-Type: {files_info[index].data_type}\r\n\r\n"
+                     ).encode()
+                     current_index = index
+                 yield from chunks_generator
+             yield f"\r\n--{boundary}--\r\n".encode()
+
+         return _build_response(
+             content_gen=multipart_stream(),
+             media_type=f"multipart/mixed; boundary={boundary}",
+         )
+
+     else:
+         index, chunks_generator = next(files_iterator)
+         first_chunk = next(chunks_generator)
+         filename = os.path.basename(files_info[index].key)
+
+         def single_file_stream() -> Iterator[bytes]:
+             yield first_chunk
+             yield from chunks_generator
+
+         return _build_response(
+             content_gen=single_file_stream(),
+             media_type=files_info[index].data_type,
+             extra_headers={"content-disposition": f'attachment; filename="{filename}"'},
+         )
+
+
+ def stream_download_from_s3(
+     s3_client: S3Client,
+     files_info: list[S3FileInfo],
+     byte_range: tuple[Optional[int], Optional[int]] = (None, None),
+     compress: Literal["zip", "raw", "auto"] = "auto",
+     zip_filename: str = "archive",
+     range_size: int = 1024**2 * 8,
+     max_workers: int = 8,
+ ) -> StreamResponse:
      """
-     with open_s3_zipped_object(bucket_name, key_name, client_s3) as zip_file:
-         logger.debug("Found %s files in %s" % (len(zip_file.filelist), key_name))
-         return zip_file.filelist
+     Stream data from one or more S3 objects in chunks, with support for global byte ranges
+     and partial file extraction from ZIP archives.
+
+     This function downloads product data from S3 using concurrent range requests across one or
+     multiple files. It divides the requested data into chunks (default: 8 MiB) and issues
+     parallel HTTP range requests to optimize download throughput. This is particularly useful
+     for large files or datasets stored across multiple S3 objects.
+
+     If the S3 key refers to a path inside a ``.zip`` file (denoted by ``.zip!<internal_path>``),
+     the function extracts the specified file from the archive only if it is stored uncompressed
+     (ZIP method = STORE). Compressed formats (like DEFLATE) are not supported for partial ZIP extraction.
+
+     The function supports global byte range filtering via the ``byte_range`` parameter, which allows
+     requesting only a specific portion of the logical file stream across all provided objects.
+
+     Downloads are performed concurrently using a thread pool and HTTP range requests. Each chunk is downloaded
+     as a separate HTTP request and yielded in file order.
+
+     The ``compress`` parameter determines the output format:
+
+     - ``zip``: Always produce a ZIP archive containing all files.
+     - ``raw``: Stream files directly without wrapping, either as a single file or multipart response.
+     - ``auto``: Automatically select the format:
+         - raw stream if only a single file is requested
+         - ZIP archive if multiple files are requested
+
+     :param s3_client: A configured S3 client capable of making range requests.
+     :param files_info: List of S3FileInfo objects representing the files to download.
+     :param byte_range: Tuple (start, end) defining the inclusive global byte range to download across all objects.
+         Either value can be None to indicate open-ended range.
+     :param compress: Determines the output format of the streamed response.
+     :param zip_filename: The base filename to use when producing a ZIP archive (without extension).
+     :param range_size: The size in bytes of each download chunk. Defaults to 8 MiB.
+     :param max_workers: The maximum number of concurrent download tasks. Controls the size of the thread pool.
+     :return: Streaming HTTP response with content according to the requested format.
+     :raises DownloadError: If any error occurs during streaming from S3, including missing files or
+         unsupported ZIP compression.
+     """
+     offset = 0
+
+     executor = ThreadPoolExecutor(max_workers=max_workers)
+     try:
+         for f_info in files_info:
+             # Check if file is inside a ZIP
+             if ".zip!" in f_info.key:
+                 future = executor.submit(_prepare_file_in_zip, f_info, s3_client)
+                 f_info.futures[future] = 0
+
+         for f_info in files_info:
+             for future in f_info.futures:
+                 future.result()
+             f_info.file_start_offset = offset
+             offset += f_info.size
+
+             if not f_info.data_type or f_info.data_type == MIME_OCTET_STREAM:
+                 guessed = guess_file_type(f_info.key)
+                 f_info.data_type = guessed or MIME_OCTET_STREAM
+
+         chunks_tuple = _chunks_from_s3_objects(
+             s3_client,
+             files_info,
+             byte_range,
+             range_size,
+             executor,
+         )
+
+         return _build_stream_response(
+             zip_filename, files_info, chunks_tuple, compress, executor
+         )
+     except Exception as e:
+         executor.shutdown(wait=True)
+         raise DownloadError(str(e)) from e


  def update_assets_from_s3(
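
Putting this hunk together, a hedged usage sketch of the new entry point; the bucket, keys and sizes are illustrative, and StreamResponse is simply the container built above (content iterator, media type, headers):

import boto3

from eodag.utils.s3 import S3FileInfo, stream_download_from_s3

s3 = boto3.client("s3")

files = [
    # A regular object; its size must be supplied by the caller.
    S3FileInfo(size=52_428_800, key="products/scene/B04.tif", bucket_name="my-bucket"),
    # A member stored (uncompressed) inside a ZIP, addressed with the ".zip!" notation;
    # its size and data offset are resolved from the ZIP central directory before download.
    S3FileInfo(size=0, key="products/scene.zip!metadata.xml", bucket_name="my-bucket"),
]

# Two files with the default compress="auto" -> a ZIP archive is streamed back.
response = stream_download_from_s3(s3, files, zip_filename="scene", max_workers=4)
with open("scene.zip", "wb") as out:
    for chunk in response.content:  # the content iterator passed to StreamResponse above
        out.write(chunk)
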
@@ -229,3 +569,203 @@ def update_assets_from_s3(
          raise NotAvailableError(
              f"assets for product {prefix} could not be found"
          ) from e
+
+
+ # ----- ZIP section -----
+
+
+ def open_s3_zipped_object(
+     bucket_name: str,
+     key_name: str,
+     s3_client,
+     zip_size: Optional[int] = None,
+     partial: bool = True,
+ ) -> tuple[ZipFile, bytes]:
+     """
+     Fetches the central directory and EOCD (End Of Central Directory) from an S3 object and opens a ZipFile in memory.
+
+     This function retrieves the ZIP file's central directory and EOCD by performing range requests on the S3 object.
+     It supports partial fetching (only the central directory and EOCD) for efficiency, or full ZIP download if needed.
+
+     :param bucket_name: Name of the S3 bucket containing the ZIP file.
+     :param key_name: Key (path) of the ZIP file in the S3 bucket.
+     :param s3_client: S3 client instance used to perform range requests.
+     :param zip_size: Size of the ZIP file in bytes. If None, it will be determined via a HEAD request.
+     :param partial: If True, only fetch the central directory and EOCD. If False, fetch the entire ZIP file.
+     :return: Tuple containing the opened ZipFile object and the central directory bytes.
+     :raises InvalidDataError: If the EOCD signature is not found in the last 64KB of the file.
+     """
+     # EOCD is at least 22 bytes, but can be longer if ZIP comment exists.
+     # For simplicity, we fetch last 64KB max (max EOCD + comment length allowed by ZIP spec)
+     if zip_size is None:
+         response = s3_client.head_object(Bucket=bucket_name, Key=key_name)
+         zip_size = int(response["ContentLength"])
+
+     fetch_size = min(65536, zip_size)
+     eocd_search = fetch_range(
+         bucket_name, key_name, zip_size - fetch_size, zip_size - 1, s3_client
+     )
+
+     # Find EOCD signature from end: 0x06054b50 (little endian)
+     eocd_signature = b"\x50\x4b\x05\x06"
+     eocd_offset = eocd_search.rfind(eocd_signature)
+     if eocd_offset == -1:
+         raise InvalidDataError("EOCD signature not found in last 64KB of the file.")
+
+     eocd = eocd_search[eocd_offset : eocd_offset + 22]
+
+     cd_size = parse_le_uint32(eocd[12:16])
+     cd_start = parse_le_uint32(eocd[16:20])
+
+     # Fetch central directory
+     cd_data = fetch_range(
+         bucket_name, key_name, cd_start, cd_start + cd_size - 1, s3_client
+     )
+
+     zip_data = (
+         cd_data + eocd
+         if partial
+         else fetch_range(bucket_name, key_name, 0, zip_size - 1, s3_client)
+     )
+     zipf = ZipFile(io.BytesIO(zip_data))
+     return zipf, cd_data
+
+
+ def _parse_central_directory_entry(cd_data: bytes, offset: int) -> dict[str, int]:
+     """
+     Parse one central directory file header entry starting at offset.
+     Returns dict with relative local header offset and sizes.
+     """
+     signature = cd_data[offset : offset + 4]
+     if signature != b"PK\x01\x02":
+         raise InvalidDataError("Bad central directory file header signature")
+
+     filename_len = parse_le_uint16(cd_data[offset + 28 : offset + 30])
+     extra_len = parse_le_uint16(cd_data[offset + 30 : offset + 32])
+     comment_len = parse_le_uint16(cd_data[offset + 32 : offset + 34])
+
+     relative_offset = parse_le_uint32(cd_data[offset + 42 : offset + 46])
+
+     header_size = 46 + filename_len + extra_len + comment_len
+
+     return {
+         "relative_offset": relative_offset,
+         "header_size": header_size,
+         "filename_len": filename_len,
+         "extra_len": extra_len,
+         "comment_len": comment_len,
+         "total_size": header_size,
+     }
+
+
+ def _parse_local_file_header(local_header_bytes: bytes) -> int:
+     """
+     Parse local file header to find total header size:
+     fixed 30 bytes + filename length + extra field length
+     """
+     if local_header_bytes[0:4] != b"PK\x03\x04":
+         raise InvalidDataError("Bad local file header signature")
+
+     filename_len = parse_le_uint16(local_header_bytes[26:28])
+     extra_len = parse_le_uint16(local_header_bytes[28:30])
+     total_size = 30 + filename_len + extra_len
+     return total_size
+
+
+ def file_position_from_s3_zip(
+     s3_bucket: str,
+     object_key: str,
+     s3_client,
+     target_filepath: str,
+ ) -> tuple[int, int]:
+     """
+     Get the start position and size of a specific file inside a ZIP archive stored in S3.
+     This function assumes the file is uncompressed (ZIP_STORED).
+
+     The returned tuple contains:
+
+     - **file_data_start**: The byte offset where the file data starts in the ZIP archive.
+     - **file_size**: The size of the file in bytes.
+
+     :param s3_bucket: The S3 bucket name.
+     :param object_key: The S3 object key for the ZIP file.
+     :param s3_client: The Boto3 S3 client.
+     :param target_filepath: The file path inside the ZIP archive to locate.
+     :return: A tuple (file_data_start, file_size)
+     :raises FileNotFoundError: If the target file is not found in the ZIP archive.
+     :raises NotImplementedError: If the file is not uncompressed (ZIP_STORED)
+     """
+     zipf, cd_data = open_s3_zipped_object(s3_bucket, object_key, s3_client)
+
+     # Find file in zipf.filelist to get its index and f_info
+     target_info = None
+     cd_offset = 0
+     for fi in zipf.filelist:
+         if fi.filename == target_filepath:
+             target_info = fi
+             break
+         # 46 is the fixed size (in bytes) of the Central Directory File Header according to the ZIP spec
+         cd_entry_len = (
+             46 + len(fi.filename.encode("utf-8")) + len(fi.extra) + len(fi.comment)
+         )
+         cd_offset += cd_entry_len
+
+     zipf.close()
+
+     if target_info is None:
+         raise FileNotFoundError(f"File {target_filepath} not found in ZIP archive")
+
+     if target_info.compress_type != ZIP_STORED:
+         raise NotImplementedError("Only uncompressed files (ZIP_STORED) are supported.")
+
+     # Parse central directory entry to get relative local header offset
+     cd_entry = cd_data[
+         cd_offset : cd_offset
+         + (
+             46
+             + len(target_info.filename.encode("utf-8"))
+             + len(target_info.extra)
+             + len(target_info.comment)
+         )
+     ]
+     cd_entry_info = _parse_central_directory_entry(cd_entry, 0)
+
+     local_header_offset = cd_entry_info["relative_offset"]
+
+     # Fetch local file header from S3 (at least 30 bytes + filename + extra field)
+     # We'll fetch 4 KB max to cover large filenames/extra fields safely
+     local_header_fetch_size = 4096
+     local_header_bytes = fetch_range(
+         s3_bucket,
+         object_key,
+         local_header_offset,
+         local_header_offset + local_header_fetch_size - 1,
+         s3_client,
+     )
+
+     local_header_size = _parse_local_file_header(local_header_bytes)
+
+     # Calculate file data start and end offsets
+     file_data_start = local_header_offset + local_header_size
+
+     return file_data_start, target_info.file_size
+
+
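
A short sketch combining the helper above with fetch_range to read one stored (uncompressed) member without downloading the whole archive; bucket, key and member name are placeholders:

import boto3

from eodag.utils.s3 import fetch_range, file_position_from_s3_zip

s3 = boto3.client("s3")

# Locate the member's data inside the remote ZIP, then range-fetch exactly those bytes.
start, size = file_position_from_s3_zip("my-bucket", "products/scene.zip", s3, "metadata.xml")
member_bytes = fetch_range("my-bucket", "products/scene.zip", start, start + size - 1, s3)
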
+ def list_files_in_s3_zipped_object(
+     bucket_name: str, key_name: str, s3_client: S3Client
+ ) -> list[ZipInfo]:
+     """
+     List files in s3 zipped object, without downloading it.
+
+     See https://stackoverflow.com/questions/41789176/how-to-count-files-inside-zip-in-aws-s3-without-downloading-it;
+     Based on https://stackoverflow.com/questions/51351000/read-zip-files-from-s3-without-downloading-the-entire-file
+
+     :param bucket_name: Bucket name of the object to fetch
+     :param key_name: Key name of the object to fetch
+     :param s3_client: s3 client used to fetch the object
+     :returns: List of files in zip
+     """
+     zip_file, _ = open_s3_zipped_object(bucket_name, key_name, s3_client)
+     with zip_file:
+         logger.debug("Found %s files in %s" % (len(zip_file.filelist), key_name))
+         return zip_file.filelist
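
Finally, a minimal sketch of listing archive members with the reworked helper, which builds zipfile.ZipInfo entries from the central directory alone; bucket and key are placeholders:

import boto3

from eodag.utils.s3 import list_files_in_s3_zipped_object

s3 = boto3.client("s3")

for zip_info in list_files_in_s3_zipped_object("my-bucket", "products/scene.zip", s3):
    print(zip_info.filename, zip_info.file_size)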