airbyte-source-s3 4.13.4__tar.gz → 4.14.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of airbyte-source-s3 has been flagged as potentially problematic; consult the package registry's advisory page for details.

Files changed (24)
  1. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/PKG-INFO +1 -1
  2. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/pyproject.toml +1 -1
  3. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/stream_reader.py +43 -13
  4. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/README.md +0 -0
  5. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/__init__.py +0 -0
  6. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/exceptions.py +0 -0
  7. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/run.py +0 -0
  8. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source.py +0 -0
  9. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/__init__.py +0 -0
  10. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/formats/__init__.py +0 -0
  11. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/formats/avro_spec.py +0 -0
  12. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/formats/csv_spec.py +0 -0
  13. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/formats/jsonl_spec.py +0 -0
  14. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/formats/parquet_spec.py +0 -0
  15. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/source.py +0 -0
  16. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/source_files_abstract/spec.py +0 -0
  17. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/stream.py +0 -0
  18. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/utils.py +0 -0
  19. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/__init__.py +0 -0
  20. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/config.py +0 -0
  21. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/cursor.py +0 -0
  22. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/legacy_config_transformer.py +0 -0
  23. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/source.py +0 -0
  24. {airbyte_source_s3-4.13.4 → airbyte_source_s3-4.14.0}/source_s3/v4/zip_reader.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: airbyte-source-s3
3
- Version: 4.13.4
3
+ Version: 4.14.0
4
4
  Summary: Source implementation for S3.
5
5
  License: ELv2
6
6
  Author: Airbyte
@@ -5,7 +5,7 @@ requires = [
5
5
  build-backend = "poetry.core.masonry.api"
6
6
 
7
7
  [tool.poetry]
8
- version = "4.13.4"
8
+ version = "4.14.0"
9
9
  name = "airbyte-source-s3"
10
10
  description = "Source implementation for S3."
11
11
  authors = [
@@ -7,7 +7,8 @@ import time
7
7
  from datetime import datetime
8
8
  from io import IOBase
9
9
  from os import getenv
10
- from typing import Dict, Iterable, List, Optional, Set, cast
10
+ from os.path import basename, dirname
11
+ from typing import Dict, Iterable, List, Optional, Set, Tuple, cast
11
12
 
12
13
  import boto3.session
13
14
  import pendulum
@@ -22,8 +23,10 @@ from botocore.session import get_session
22
23
  from typing_extensions import override
23
24
 
24
25
  from airbyte_cdk import FailureType
26
+ from airbyte_cdk.models import AirbyteRecordMessageFileReference
25
27
  from airbyte_cdk.sources.file_based.exceptions import CustomFileBasedException, ErrorListingFiles, FileBasedSourceError, FileSizeLimitError
26
28
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
29
+ from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
27
30
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
28
31
  from source_s3.v4.config import Config
29
32
  from source_s3.v4.zip_reader import DecompressedStream, RemoteFileInsideArchive, ZipContentReader, ZipFileHandler
@@ -165,6 +168,19 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
165
168
  endpoint=self.config.endpoint,
166
169
  ) from exc
167
170
 
171
+ def _construct_s3_uri(self, file: RemoteFile) -> str:
172
+ """
173
+ Constructs the S3 URI for a given file, handling both regular files and files inside archives.
174
+
175
+ Args:
176
+ file: The RemoteFile object representing either a regular file or a file inside an archive
177
+
178
+ Returns:
179
+ str: The properly formatted S3 URI
180
+ """
181
+ file_path = file.uri.split("#")[0] if isinstance(file, RemoteFileInsideArchive) else file.uri
182
+ return f"s3://{self.config.bucket}/{file_path}"
183
+
168
184
  def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
169
185
  try:
170
186
  params = {"client": self.s3_client}
@@ -173,14 +189,13 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
173
189
 
174
190
  logger.debug(f"try to open {file.uri}")
175
191
  try:
192
+ s3_uri = self._construct_s3_uri(file)
176
193
  if isinstance(file, RemoteFileInsideArchive):
177
- s3_file_object = smart_open.open(f"s3://{self.config.bucket}/{file.uri.split('#')[0]}", transport_params=params, mode="rb")
194
+ s3_file_object = smart_open.open(s3_uri, transport_params=params, mode="rb")
178
195
  decompressed_stream = DecompressedStream(s3_file_object, file)
179
196
  result = ZipContentReader(decompressed_stream, encoding)
180
197
  else:
181
- result = smart_open.open(
182
- f"s3://{self.config.bucket}/{file.uri}", transport_params=params, mode=mode.value, encoding=encoding
183
- )
198
+ result = smart_open.open(s3_uri, transport_params=params, mode=mode.value, encoding=encoding)
184
199
  except OSError:
185
200
  logger.warning(
186
201
  f"We don't have access to {file.uri}. The file appears to have become unreachable during sync."
@@ -221,7 +236,9 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
221
236
  return progress_handler
222
237
 
223
238
  @override
224
- def get_file(self, file: RemoteFile, local_directory: str, logger: logging.Logger) -> Dict[str, str | int]:
239
+ def upload(
240
+ self, file: RemoteFile, local_directory: str, logger: logging.Logger
241
+ ) -> Tuple[FileRecordData, AirbyteRecordMessageFileReference]:
225
242
  """
226
243
  Downloads a file from an S3 bucket to a specified local directory.
227
244
 
@@ -231,11 +248,7 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
231
248
  logger (logging.Logger): Logger for logging information and errors.
232
249
 
233
250
  Returns:
234
- dict: A dictionary containing the following:
235
- - "file_url" (str): The absolute path of the downloaded file.
236
- - "bytes" (int): The file size in bytes.
237
- - "file_relative_path" (str): The relative path of the file for local storage. Is relative to local_directory as
238
- this a mounted volume in the pod container.
251
+ Tuple[FileRecordData, AirbyteRecordMessageFileReference]: Contains file record data and file reference for Airbyte protocol.
239
252
 
240
253
  Raises:
241
254
  FileSizeLimitError: If the file size exceeds the predefined limit (1 GB).
@@ -246,7 +259,10 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
246
259
  message = "File size exceeds the 1 GB limit."
247
260
  raise FileSizeLimitError(message=message, internal_message=message, failure_type=FailureType.config_error)
248
261
 
249
- file_relative_path, local_file_path, absolute_file_path = self._get_file_transfer_paths(file, local_directory)
262
+ file_paths = self._get_file_transfer_paths(file.uri, local_directory)
263
+ local_file_path = file_paths[self.LOCAL_FILE_PATH]
264
+ file_relative_path = file_paths[self.FILE_RELATIVE_PATH]
265
+ file_name = file_paths[self.FILE_NAME]
250
266
 
251
267
  logger.info(
252
268
  f"Starting to download the file {file.uri} with size: {file_size / (1024 * 1024):,.2f} MB ({file_size / (1024 * 1024 * 1024):.2f} GB)"
@@ -258,7 +274,21 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
258
274
  write_duration = time.time() - start_download_time
259
275
  logger.info(f"Finished downloading the file {file.uri} and saved to {local_file_path} in {write_duration:,.2f} seconds.")
260
276
 
261
- return {"file_url": absolute_file_path, "bytes": file_size, "file_relative_path": file_relative_path}
277
+ file_record_data = FileRecordData(
278
+ folder=file_paths[self.FILE_FOLDER],
279
+ file_name=file_name,
280
+ bytes=file_size,
281
+ updated_at=file.last_modified.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
282
+ source_uri=self._construct_s3_uri(file),
283
+ )
284
+
285
+ file_reference = AirbyteRecordMessageFileReference(
286
+ staging_file_url=local_file_path,
287
+ source_file_relative_path=file_relative_path,
288
+ file_size_bytes=file_size,
289
+ )
290
+
291
+ return file_record_data, file_reference
262
292
 
263
293
  @override
264
294
  def file_size(self, file: RemoteFile) -> int: