airbyte-source-s3 4.14.0.dev202504091813__tar.gz → 4.15.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/PKG-INFO +8 -6
  2. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/pyproject.toml +13 -23
  3. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/stream_reader.py +19 -6
  4. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/zip_reader.py +15 -1
  5. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/README.md +0 -0
  6. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/__init__.py +0 -0
  7. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/exceptions.py +0 -0
  8. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/run.py +0 -0
  9. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source.py +0 -0
  10. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/__init__.py +0 -0
  11. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/formats/__init__.py +0 -0
  12. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/formats/avro_spec.py +0 -0
  13. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/formats/csv_spec.py +0 -0
  14. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/formats/jsonl_spec.py +0 -0
  15. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/formats/parquet_spec.py +0 -0
  16. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/source.py +0 -0
  17. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/source_files_abstract/spec.py +0 -0
  18. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/stream.py +0 -0
  19. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/utils.py +0 -0
  20. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/__init__.py +0 -0
  21. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/config.py +0 -0
  22. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/cursor.py +0 -0
  23. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/legacy_config_transformer.py +0 -0
  24. {airbyte_source_s3-4.14.0.dev202504091813 → airbyte_source_s3-4.15.1}/source_s3/v4/source.py +0 -0
@@ -1,25 +1,27 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.1
2
2
  Name: airbyte-source-s3
3
- Version: 4.14.0.dev202504091813
3
+ Version: 4.15.1
4
4
  Summary: Source implementation for S3.
5
+ Home-page: https://airbyte.com
5
6
  License: ELv2
6
7
  Author: Airbyte
7
8
  Author-email: contact@airbyte.io
8
- Requires-Python: >=3.10,<3.12
9
+ Requires-Python: >=3.10,<3.14
9
10
  Classifier: License :: Other/Proprietary License
10
11
  Classifier: Programming Language :: Python :: 3
11
12
  Classifier: Programming Language :: Python :: 3.10
12
13
  Classifier: Programming Language :: Python :: 3.11
13
- Requires-Dist: airbyte-cdk[file-based] (==6.45.0.dev04107)
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Dist: airbyte-cdk[file-based] (>=7.0.4,<8.0.0)
14
17
  Requires-Dist: dill (>=0.3.4,<0.4.0)
15
18
  Requires-Dist: pendulum (>=3.0.0,<4.0.0)
16
19
  Requires-Dist: pytz (>=2024.2,<2025.0)
17
- Requires-Dist: smart-open[s3] (>=5.1.0,<6.0.0)
20
+ Requires-Dist: smart-open[s3] (==4.15.1)
18
21
  Requires-Dist: transformers (>=4.38.2,<5.0.0)
19
22
  Requires-Dist: urllib3 (<2)
20
23
  Requires-Dist: wcmatch (>=10.0,<11.0)
21
24
  Project-URL: Documentation, https://docs.airbyte.com/integrations/sources/s3
22
- Project-URL: Homepage, https://airbyte.com
23
25
  Project-URL: Repository, https://github.com/airbytehq/airbyte
24
26
  Description-Content-Type: text/markdown
25
27
 
@@ -1,49 +1,37 @@
1
1
  [build-system]
2
- requires = [
3
- "poetry-core>=1.0.0",
4
- ]
2
+ requires = [ "poetry-core>=1.0.0",]
5
3
  build-backend = "poetry.core.masonry.api"
6
4
 
7
5
  [tool.poetry]
8
- version = "4.14.0.dev202504091813"
6
+ version = "4.15.1"
9
7
  name = "airbyte-source-s3"
10
8
  description = "Source implementation for S3."
11
- authors = [
12
- "Airbyte <contact@airbyte.io>",
13
- ]
9
+ authors = [ "Airbyte <contact@airbyte.io>",]
14
10
  license = "ELv2"
15
11
  readme = "README.md"
16
12
  documentation = "https://docs.airbyte.com/integrations/sources/s3"
17
13
  homepage = "https://airbyte.com"
18
14
  repository = "https://github.com/airbytehq/airbyte"
19
- packages = [
20
- { include = "source_s3" },
21
- ]
15
+ [[tool.poetry.packages]]
16
+ include = "source_s3"
22
17
 
23
18
  [tool.poetry.dependencies]
24
- python = "^3.10,<3.12"
19
+ python = "^3.10,<3.14"
25
20
  pytz = "^2024.2"
26
21
  wcmatch = "^10.0"
27
22
  dill = "^0.3.4"
28
23
  transformers = "^4.38.2"
29
24
  urllib3 = "<2"
25
+ airbyte-cdk = {extras = ["file-based"], version = "^7.0.4"}
30
26
  pendulum = "^3.0.0"
31
27
 
32
- [tool.poetry.dependencies.airbyte-cdk]
33
- extras = [
34
- "file-based",
35
- ]
36
- version = "6.45.0.dev04107"
37
-
38
- [tool.poetry.dependencies.smart-open]
39
- extras = [
40
- "s3",
41
- ]
42
- version = "^5.1.0"
43
-
44
28
  [tool.poetry.scripts]
45
29
  source-s3 = "source_s3.run:run"
46
30
 
31
+ [tool.poetry.dependencies.smart-open]
32
+ extras = [ "s3",]
33
+ version = "4.15.1"
34
+
47
35
  [tool.poetry.group.dev.dependencies]
48
36
  pytest = "^8.0.0"
49
37
  moto = "==4.2.14"
@@ -54,5 +42,7 @@ pandas = "^2.0.3"
54
42
 
55
43
  [tool.poe]
56
44
  include = [
45
+ # Shared tasks definition file(s) can be imported here.
46
+ # Run `poe` or `poe --help` to see the list of available tasks.
57
47
  "${POE_GIT_DIR}/poe-tasks/poetry-connector-tasks.toml",
58
48
  ]
@@ -168,6 +168,19 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
168
168
  endpoint=self.config.endpoint,
169
169
  ) from exc
170
170
 
171
+ def _construct_s3_uri(self, file: RemoteFile) -> str:
172
+ """
173
+ Constructs the S3 URI for a given file, handling both regular files and files inside archives.
174
+
175
+ Args:
176
+ file: The RemoteFile object representing either a regular file or a file inside an archive
177
+
178
+ Returns:
179
+ str: The properly formatted S3 URI
180
+ """
181
+ file_path = file.uri.split("#")[0] if isinstance(file, RemoteFileInsideArchive) else file.uri
182
+ return f"s3://{self.config.bucket}/{file_path}"
183
+
171
184
  def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
172
185
  try:
173
186
  params = {"client": self.s3_client}
@@ -176,14 +189,13 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
176
189
 
177
190
  logger.debug(f"try to open {file.uri}")
178
191
  try:
192
+ s3_uri = self._construct_s3_uri(file)
179
193
  if isinstance(file, RemoteFileInsideArchive):
180
- s3_file_object = smart_open.open(f"s3://{self.config.bucket}/{file.uri.split('#')[0]}", transport_params=params, mode="rb")
194
+ s3_file_object = smart_open.open(s3_uri, transport_params=params, mode="rb")
181
195
  decompressed_stream = DecompressedStream(s3_file_object, file)
182
196
  result = ZipContentReader(decompressed_stream, encoding)
183
197
  else:
184
- result = smart_open.open(
185
- f"s3://{self.config.bucket}/{file.uri}", transport_params=params, mode=mode.value, encoding=encoding
186
- )
198
+ result = smart_open.open(s3_uri, transport_params=params, mode=mode.value, encoding=encoding)
187
199
  except OSError:
188
200
  logger.warning(
189
201
  f"We don't have access to {file.uri}. The file appears to have become unreachable during sync."
@@ -247,7 +259,7 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
247
259
  message = "File size exceeds the 1 GB limit."
248
260
  raise FileSizeLimitError(message=message, internal_message=message, failure_type=FailureType.config_error)
249
261
 
250
- file_paths = self._get_file_transfer_paths(file, local_directory)
262
+ file_paths = self._get_file_transfer_paths(file.uri, local_directory)
251
263
  local_file_path = file_paths[self.LOCAL_FILE_PATH]
252
264
  file_relative_path = file_paths[self.FILE_RELATIVE_PATH]
253
265
  file_name = file_paths[self.FILE_NAME]
@@ -264,9 +276,10 @@ class SourceS3StreamReader(AbstractFileBasedStreamReader):
264
276
 
265
277
  file_record_data = FileRecordData(
266
278
  folder=file_paths[self.FILE_FOLDER],
267
- filename=file_name,
279
+ file_name=file_name,
268
280
  bytes=file_size,
269
281
  updated_at=file.last_modified.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
282
+ source_uri=self._construct_s3_uri(file),
270
283
  )
271
284
 
272
285
  file_reference = AirbyteRecordMessageFileReference(
@@ -366,7 +366,21 @@ class ZipContentReader:
366
366
  data = self.buffer[:size]
367
367
  self.buffer = self.buffer[size:]
368
368
 
369
- return data.decode(self.encoding) if self.encoding else bytes(data)
369
+ try:
370
+ return data.decode(self.encoding) if self.encoding else bytes(data)
371
+ except UnicodeDecodeError:
372
+ if self.encoding == "utf_8_sig":
373
+ # utf_8_sig considers `\xef\xbb\xbf` as a single character and therefore calling `bytearray(b'\xef').decode("utf_8_sig") will
374
+ # cause an exception to be raised.
375
+ number_of_bytes_to_add = size - 1
376
+ if data.endswith(bytearray(b"\xef")):
377
+ number_of_bytes_to_add += 2
378
+ elif data.endswith(bytearray(b"\xbb")):
379
+ number_of_bytes_to_add += 1
380
+ data = data + self.buffer[:number_of_bytes_to_add]
381
+ self.buffer = self.buffer[number_of_bytes_to_add:]
382
+ return data.decode(self.encoding) if self.encoding else bytes(data)
383
+ raise
370
384
 
371
385
  def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
372
386
  """