airbyte-source-s3 4.14.0.dev202504091813__tar.gz → 4.15.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/PKG-INFO +8 -6
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/pyproject.toml +13 -23
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/stream_reader.py +19 -6
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/zip_reader.py +15 -1
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/README.md +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/__init__.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/exceptions.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/run.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/__init__.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/formats/__init__.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/formats/avro_spec.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/formats/csv_spec.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/formats/jsonl_spec.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/formats/parquet_spec.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/source.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/spec.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/stream.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/utils.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/__init__.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/config.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/cursor.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/legacy_config_transformer.py +0 -0
- {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/source.py +0 -0
|
@@ -1,25 +1,27 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: airbyte-source-s3
|
|
3
|
-
Version: 4.14.0.dev202504091813
|
|
3
|
+
Version: 4.15.1
|
|
4
4
|
Summary: Source implementation for S3.
|
|
5
|
+
Home-page: https://airbyte.com
|
|
5
6
|
License: ELv2
|
|
6
7
|
Author: Airbyte
|
|
7
8
|
Author-email: contact@airbyte.io
|
|
8
|
-
Requires-Python: >=3.10,<3.
|
|
9
|
+
Requires-Python: >=3.10,<3.14
|
|
9
10
|
Classifier: License :: Other/Proprietary License
|
|
10
11
|
Classifier: Programming Language :: Python :: 3
|
|
11
12
|
Classifier: Programming Language :: Python :: 3.10
|
|
12
13
|
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
-
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Dist: airbyte-cdk[file-based] (>=7.0.4,<8.0.0)
|
|
14
17
|
Requires-Dist: dill (>=0.3.4,<0.4.0)
|
|
15
18
|
Requires-Dist: pendulum (>=3.0.0,<4.0.0)
|
|
16
19
|
Requires-Dist: pytz (>=2024.2,<2025.0)
|
|
17
|
-
Requires-Dist: smart-open[s3] (>=5.1.0,<6.0.0)
|
|
20
|
+
Requires-Dist: smart-open[s3] (==4.15.1)
|
|
18
21
|
Requires-Dist: transformers (>=4.38.2,<5.0.0)
|
|
19
22
|
Requires-Dist: urllib3 (<2)
|
|
20
23
|
Requires-Dist: wcmatch (>=10.0,<11.0)
|
|
21
24
|
Project-URL: Documentation, https://docs.airbyte.com/integrations/sources/s3
|
|
22
|
-
Project-URL: Homepage, https://airbyte.com
|
|
23
25
|
Project-URL: Repository, https://github.com/airbytehq/airbyte
|
|
24
26
|
Description-Content-Type: text/markdown
|
|
25
27
|
|
|
@@ -1,49 +1,37 @@
|
|
|
1
1
|
[build-system]
|
|
2
|
-
requires = [
|
|
3
|
-
"poetry-core>=1.0.0",
|
|
4
|
-
]
|
|
2
|
+
requires = [ "poetry-core>=1.0.0",]
|
|
5
3
|
build-backend = "poetry.core.masonry.api"
|
|
6
4
|
|
|
7
5
|
[tool.poetry]
|
|
8
|
-
version = "4.14.0.dev202504091813"
|
|
6
|
+
version = "4.15.1"
|
|
9
7
|
name = "airbyte-source-s3"
|
|
10
8
|
description = "Source implementation for S3."
|
|
11
|
-
authors = [
|
|
12
|
-
"Airbyte <contact@airbyte.io>",
|
|
13
|
-
]
|
|
9
|
+
authors = [ "Airbyte <contact@airbyte.io>",]
|
|
14
10
|
license = "ELv2"
|
|
15
11
|
readme = "README.md"
|
|
16
12
|
documentation = "https://docs.airbyte.com/integrations/sources/s3"
|
|
17
13
|
homepage = "https://airbyte.com"
|
|
18
14
|
repository = "https://github.com/airbytehq/airbyte"
|
|
19
|
-
packages = [
|
|
20
|
-
|
|
21
|
-
]
|
|
15
|
+
[[tool.poetry.packages]]
|
|
16
|
+
include = "source_s3"
|
|
22
17
|
|
|
23
18
|
[tool.poetry.dependencies]
|
|
24
|
-
python = "^3.10,<3.
|
|
19
|
+
python = "^3.10,<3.14"
|
|
25
20
|
pytz = "^2024.2"
|
|
26
21
|
wcmatch = "^10.0"
|
|
27
22
|
dill = "^0.3.4"
|
|
28
23
|
transformers = "^4.38.2"
|
|
29
24
|
urllib3 = "<2"
|
|
25
|
+
airbyte-cdk = {extras = ["file-based"], version = "^7.0.4"}
|
|
30
26
|
pendulum = "^3.0.0"
|
|
31
27
|
|
|
32
|
-
[tool.poetry.dependencies.airbyte-cdk]
|
|
33
|
-
extras = [
|
|
34
|
-
"file-based",
|
|
35
|
-
]
|
|
36
|
-
version = "6.45.0.dev04107"
|
|
37
|
-
|
|
38
|
-
[tool.poetry.dependencies.smart-open]
|
|
39
|
-
extras = [
|
|
40
|
-
"s3",
|
|
41
|
-
]
|
|
42
|
-
version = "^5.1.0"
|
|
43
|
-
|
|
44
28
|
[tool.poetry.scripts]
|
|
45
29
|
source-s3 = "source_s3.run:run"
|
|
46
30
|
|
|
31
|
+
[tool.poetry.dependencies.smart-open]
|
|
32
|
+
extras = [ "s3",]
|
|
33
|
+
version = "4.15.1"
|
|
34
|
+
|
|
47
35
|
[tool.poetry.group.dev.dependencies]
|
|
48
36
|
pytest = "^8.0.0"
|
|
49
37
|
moto = "==4.2.14"
|
|
@@ -54,5 +42,7 @@ pandas = "^2.0.3"
|
|
|
54
42
|
|
|
55
43
|
[tool.poe]
|
|
56
44
|
include = [
|
|
45
|
+
# Shared tasks definition file(s) can be imported here.
|
|
46
|
+
# Run `poe` or `poe --help` to see the list of available tasks.
|
|
57
47
|
"${POE_GIT_DIR}/poe-tasks/poetry-connector-tasks.toml",
|
|
58
48
|
]
|
{airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/stream_reader.py
RENAMED
|
@@ -168,6 +168,19 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
|
|
|
168
168
|
endpoint=self.config.endpoint,
|
|
169
169
|
) from exc
|
|
170
170
|
|
|
171
|
+
def _construct_s3_uri(self, file: RemoteFile) -> str:
|
|
172
|
+
"""
|
|
173
|
+
Constructs the S3 URI for a given file, handling both regular files and files inside archives.
|
|
174
|
+
|
|
175
|
+
Args:
|
|
176
|
+
file: The RemoteFile object representing either a regular file or a file inside an archive
|
|
177
|
+
|
|
178
|
+
Returns:
|
|
179
|
+
str: The properly formatted S3 URI
|
|
180
|
+
"""
|
|
181
|
+
file_path = file.uri.split("#")[0] if isinstance(file, RemoteFileInsideArchive) else file.uri
|
|
182
|
+
return f"s3://{self.config.bucket}/{file_path}"
|
|
183
|
+
|
|
171
184
|
def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
|
|
172
185
|
try:
|
|
173
186
|
params = {"client": self.s3_client}
|
|
@@ -176,14 +189,13 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
|
|
|
176
189
|
|
|
177
190
|
logger.debug(f"try to open {file.uri}")
|
|
178
191
|
try:
|
|
192
|
+
s3_uri = self._construct_s3_uri(file)
|
|
179
193
|
if isinstance(file, RemoteFileInsideArchive):
|
|
180
|
-
s3_file_object = smart_open.open(
|
|
194
|
+
s3_file_object = smart_open.open(s3_uri, transport_params=params, mode="rb")
|
|
181
195
|
decompressed_stream = DecompressedStream(s3_file_object, file)
|
|
182
196
|
result = ZipContentReader(decompressed_stream, encoding)
|
|
183
197
|
else:
|
|
184
|
-
result = smart_open.open(
|
|
185
|
-
f"s3://{self.config.bucket}/{file.uri}", transport_params=params, mode=mode.value, encoding=encoding
|
|
186
|
-
)
|
|
198
|
+
result = smart_open.open(s3_uri, transport_params=params, mode=mode.value, encoding=encoding)
|
|
187
199
|
except OSError:
|
|
188
200
|
logger.warning(
|
|
189
201
|
f"We don't have access to {file.uri}. The file appears to have become unreachable during sync."
|
|
@@ -247,7 +259,7 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
|
|
|
247
259
|
message = "File size exceeds the 1 GB limit."
|
|
248
260
|
raise FileSizeLimitError(message=message, internal_message=message, failure_type=FailureType.config_error)
|
|
249
261
|
|
|
250
|
-
file_paths = self._get_file_transfer_paths(file, local_directory)
|
|
262
|
+
file_paths = self._get_file_transfer_paths(file.uri, local_directory)
|
|
251
263
|
local_file_path = file_paths[self.LOCAL_FILE_PATH]
|
|
252
264
|
file_relative_path = file_paths[self.FILE_RELATIVE_PATH]
|
|
253
265
|
file_name = file_paths[self.FILE_NAME]
|
|
@@ -264,9 +276,10 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
|
|
|
264
276
|
|
|
265
277
|
file_record_data = FileRecordData(
|
|
266
278
|
folder=file_paths[self.FILE_FOLDER],
|
|
267
|
-
|
|
279
|
+
file_name=file_name,
|
|
268
280
|
bytes=file_size,
|
|
269
281
|
updated_at=file.last_modified.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
|
|
282
|
+
source_uri=self._construct_s3_uri(file),
|
|
270
283
|
)
|
|
271
284
|
|
|
272
285
|
file_reference = AirbyteRecordMessageFileReference(
|
{airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/zip_reader.py
RENAMED
|
@@ -366,7 +366,21 @@ class ZipContentReader:
|
|
|
366
366
|
data = self.buffer[:size]
|
|
367
367
|
self.buffer = self.buffer[size:]
|
|
368
368
|
|
|
369
|
-
|
|
369
|
+
try:
|
|
370
|
+
return data.decode(self.encoding) if self.encoding else bytes(data)
|
|
371
|
+
except UnicodeDecodeError:
|
|
372
|
+
if self.encoding == "utf_8_sig":
|
|
373
|
+
# utf_8_sig considers `\xef\xbb\xbf` as a single character and therefore calling `bytearray(b'\xef').decode("utf_8_sig") will
|
|
374
|
+
# cause an exception to be raised.
|
|
375
|
+
number_of_bytes_to_add = size - 1
|
|
376
|
+
if data.endswith(bytearray(b"\xef")):
|
|
377
|
+
number_of_bytes_to_add += 2
|
|
378
|
+
elif data.endswith(bytearray(b"\xbb")):
|
|
379
|
+
number_of_bytes_to_add += 1
|
|
380
|
+
data = data + self.buffer[:number_of_bytes_to_add]
|
|
381
|
+
self.buffer = self.buffer[number_of_bytes_to_add:]
|
|
382
|
+
return data.decode(self.encoding) if self.encoding else bytes(data)
|
|
383
|
+
raise
|
|
370
384
|
|
|
371
385
|
def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
|
|
372
386
|
"""
|
|
File without changes
|
|
File without changes
|
{airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/exceptions.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/__init__.py
RENAMED
|
File without changes
|
{airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/config.py
RENAMED
|
File without changes
|
{airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/cursor.py
RENAMED
|
File without changes
|
|
File without changes
|
{airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/source.py
RENAMED
|
File without changes
|