airbyte-source-s3 4.13.4__tar.gz → 4.14.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of airbyte-source-s3 might be problematic. Click here for more details.
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/PKG-INFO +1 -1
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/pyproject.toml +1 -1
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/stream_reader.py +43 -13
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/README.md +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/__init__.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/exceptions.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/run.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/__init__.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/formats/__init__.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/formats/avro_spec.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/formats/csv_spec.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/formats/jsonl_spec.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/formats/parquet_spec.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/source.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/spec.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/stream.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/utils.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/__init__.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/config.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/cursor.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/legacy_config_transformer.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/source.py +0 -0
- {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/zip_reader.py +0 -0
|
@@ -7,7 +7,8 @@ import time
|
|
|
7
7
|
from datetime import datetime
|
|
8
8
|
from io import IOBase
|
|
9
9
|
from os import getenv
|
|
10
|
-
from
|
|
10
|
+
from os.path import basename, dirname
|
|
11
|
+
from typing import Dict, Iterable, List, Optional, Set, Tuple, cast
|
|
11
12
|
|
|
12
13
|
import boto3.session
|
|
13
14
|
import pendulum
|
|
@@ -22,8 +23,10 @@ from botocore.session import get_session
|
|
|
22
23
|
from typing_extensions import override
|
|
23
24
|
|
|
24
25
|
from airbyte_cdk import FailureType
|
|
26
|
+
from airbyte_cdk.models import AirbyteRecordMessageFileReference
|
|
25
27
|
from airbyte_cdk.sources.file_based.exceptions import CustomFileBasedException, ErrorListingFiles, FileBasedSourceError, FileSizeLimitError
|
|
26
28
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
|
29
|
+
from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
|
|
27
30
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
28
31
|
from source_s3.v4.config import Config
|
|
29
32
|
from source_s3.v4.zip_reader import DecompressedStream, RemoteFileInsideArchive, ZipContentReader, ZipFileHandler
|
|
@@ -165,6 +168,19 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
|
|
|
165
168
|
endpoint=self.config.endpoint,
|
|
166
169
|
) from exc
|
|
167
170
|
|
|
171
|
+
def _construct_s3_uri(self, file: RemoteFile) -> str:
|
|
172
|
+
"""
|
|
173
|
+
Constructs the S3 URI for a given file, handling both regular files and files inside archives.
|
|
174
|
+
|
|
175
|
+
Args:
|
|
176
|
+
file: The RemoteFile object representing either a regular file or a file inside an archive
|
|
177
|
+
|
|
178
|
+
Returns:
|
|
179
|
+
str: The properly formatted S3 URI
|
|
180
|
+
"""
|
|
181
|
+
file_path = file.uri.split("#")[0] if isinstance(file, RemoteFileInsideArchive) else file.uri
|
|
182
|
+
return f"s3://{self.config.bucket}/{file_path}"
|
|
183
|
+
|
|
168
184
|
def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
|
|
169
185
|
try:
|
|
170
186
|
params = {"client": self.s3_client}
|
|
@@ -173,14 +189,13 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
|
|
|
173
189
|
|
|
174
190
|
logger.debug(f"try to open {file.uri}")
|
|
175
191
|
try:
|
|
192
|
+
s3_uri = self._construct_s3_uri(file)
|
|
176
193
|
if isinstance(file, RemoteFileInsideArchive):
|
|
177
|
-
s3_file_object = smart_open.open(
|
|
194
|
+
s3_file_object = smart_open.open(s3_uri, transport_params=params, mode="rb")
|
|
178
195
|
decompressed_stream = DecompressedStream(s3_file_object, file)
|
|
179
196
|
result = ZipContentReader(decompressed_stream, encoding)
|
|
180
197
|
else:
|
|
181
|
-
result = smart_open.open(
|
|
182
|
-
f"s3://{self.config.bucket}/{file.uri}", transport_params=params, mode=mode.value, encoding=encoding
|
|
183
|
-
)
|
|
198
|
+
result = smart_open.open(s3_uri, transport_params=params, mode=mode.value, encoding=encoding)
|
|
184
199
|
except OSError:
|
|
185
200
|
logger.warning(
|
|
186
201
|
f"We don't have access to {file.uri}. The file appears to have become unreachable during sync."
|
|
@@ -221,7 +236,9 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
|
|
|
221
236
|
return progress_handler
|
|
222
237
|
|
|
223
238
|
@override
|
|
224
|
-
def
|
|
239
|
+
def upload(
|
|
240
|
+
self, file: RemoteFile, local_directory: str, logger: logging.Logger
|
|
241
|
+
) -> Tuple[FileRecordData, AirbyteRecordMessageFileReference]:
|
|
225
242
|
"""
|
|
226
243
|
Downloads a file from an S3 bucket to a specified local directory.
|
|
227
244
|
|
|
@@ -231,11 +248,7 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
|
|
|
231
248
|
logger (logging.Logger): Logger for logging information and errors.
|
|
232
249
|
|
|
233
250
|
Returns:
|
|
234
|
-
|
|
235
|
-
- "file_url" (str): The absolute path of the downloaded file.
|
|
236
|
-
- "bytes" (int): The file size in bytes.
|
|
237
|
-
- "file_relative_path" (str): The relative path of the file for local storage. Is relative to local_directory as
|
|
238
|
-
this a mounted volume in the pod container.
|
|
251
|
+
Tuple[FileRecordData, AirbyteRecordMessageFileReference]: Contains file record data and file reference for Airbyte protocol.
|
|
239
252
|
|
|
240
253
|
Raises:
|
|
241
254
|
FileSizeLimitError: If the file size exceeds the predefined limit (1 GB).
|
|
@@ -246,7 +259,10 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
|
|
|
246
259
|
message = "File size exceeds the 1 GB limit."
|
|
247
260
|
raise FileSizeLimitError(message=message, internal_message=message, failure_type=FailureType.config_error)
|
|
248
261
|
|
|
249
|
-
|
|
262
|
+
file_paths = self._get_file_transfer_paths(file.uri, local_directory)
|
|
263
|
+
local_file_path = file_paths[self.LOCAL_FILE_PATH]
|
|
264
|
+
file_relative_path = file_paths[self.FILE_RELATIVE_PATH]
|
|
265
|
+
file_name = file_paths[self.FILE_NAME]
|
|
250
266
|
|
|
251
267
|
logger.info(
|
|
252
268
|
f"Starting to download the file {file.uri} with size: {file_size / (1024 * 1024):,.2f} MB ({file_size / (1024 * 1024 * 1024):.2f} GB)"
|
|
@@ -258,7 +274,21 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
|
|
|
258
274
|
write_duration = time.time() - start_download_time
|
|
259
275
|
logger.info(f"Finished downloading the file {file.uri} and saved to {local_file_path} in {write_duration:,.2f} seconds.")
|
|
260
276
|
|
|
261
|
-
|
|
277
|
+
file_record_data = FileRecordData(
|
|
278
|
+
folder=file_paths[self.FILE_FOLDER],
|
|
279
|
+
file_name=file_name,
|
|
280
|
+
bytes=file_size,
|
|
281
|
+
updated_at=file.last_modified.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
|
|
282
|
+
source_uri=self._construct_s3_uri(file),
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
file_reference = AirbyteRecordMessageFileReference(
|
|
286
|
+
staging_file_url=local_file_path,
|
|
287
|
+
source_file_relative_path=file_relative_path,
|
|
288
|
+
file_size_bytes=file_size,
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
return file_record_data, file_reference
|
|
262
292
|
|
|
263
293
|
@override
|
|
264
294
|
def file_size(self, file: RemoteFile) -> int:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/source.py
RENAMED
|
File without changes
|
{airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/spec.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/legacy_config_transformer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|