airbyte-cdk 7.3.0__py3-none-any.whl → 7.3.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -3,13 +3,15 @@
3
3
  #
4
4
 
5
5
  import logging
6
+ import time
6
7
  from abc import ABC, abstractmethod
7
8
  from datetime import datetime
8
9
  from enum import Enum
9
10
  from io import IOBase
10
11
  from os import makedirs, path
11
- from typing import Any, Callable, Iterable, List, MutableMapping, Optional, Set, Tuple
12
+ from typing import Any, Iterable, List, MutableMapping, Optional, Set, Tuple
12
13
 
14
+ from airbyte_protocol_dataclasses.models import FailureType
13
15
  from wcmatch.glob import GLOBSTAR, globmatch
14
16
 
15
17
  from airbyte_cdk.models import AirbyteRecordMessageFileReference
@@ -19,8 +21,9 @@ from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import
19
21
  preserve_directory_structure,
20
22
  use_file_transfer,
21
23
  )
24
+ from airbyte_cdk.sources.file_based.exceptions import FileSizeLimitError
22
25
  from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
23
- from airbyte_cdk.sources.file_based.remote_file import RemoteFile
26
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile, UploadableRemoteFile
24
27
 
25
28
 
26
29
  class FileReadMode(Enum):
@@ -34,6 +37,7 @@ class AbstractFileBasedStreamReader(ABC):
34
37
  FILE_NAME = "file_name"
35
38
  LOCAL_FILE_PATH = "local_file_path"
36
39
  FILE_FOLDER = "file_folder"
40
+ FILE_SIZE_LIMIT = 1_500_000_000
37
41
 
38
42
  def __init__(self) -> None:
39
43
  self._config = None
@@ -113,16 +117,6 @@ class AbstractFileBasedStreamReader(ABC):
113
117
  seen.add(file.uri)
114
118
  yield file
115
119
 
116
- @abstractmethod
117
- def file_size(self, file: RemoteFile) -> int:
118
- """Utility method to get size of the remote file.
119
-
120
- This is required for connectors that will support writing to
121
- files. If the connector does not support writing files, then the
122
- subclass can simply `return 0`.
123
- """
124
- ...
125
-
126
120
  @staticmethod
127
121
  def file_matches_globs(file: RemoteFile, globs: List[str]) -> bool:
128
122
  # Use the GLOBSTAR flag to enable recursive ** matching
@@ -153,9 +147,8 @@ class AbstractFileBasedStreamReader(ABC):
153
147
  return include_identities_stream(self.config)
154
148
  return False
155
149
 
156
- @abstractmethod
157
150
  def upload(
158
- self, file: RemoteFile, local_directory: str, logger: logging.Logger
151
+ self, file: UploadableRemoteFile, local_directory: str, logger: logging.Logger
159
152
  ) -> Tuple[FileRecordData, AirbyteRecordMessageFileReference]:
160
153
  """
161
154
  This is required for connectors that will support writing to
@@ -173,7 +166,53 @@ class AbstractFileBasedStreamReader(ABC):
173
166
  - file_size_bytes (int): The size of the referenced file in bytes.
174
167
  - source_file_relative_path (str): The relative path to the referenced file in source.
175
168
  """
176
- ...
169
+ if not isinstance(file, UploadableRemoteFile):
170
+ raise TypeError(f"Expected UploadableRemoteFile, got {type(file)}")
171
+
172
+ file_size = file.size
173
+
174
+ if file_size > self.FILE_SIZE_LIMIT:
175
+ message = f"File size exceeds the {self.FILE_SIZE_LIMIT / 1e9} GB limit."
176
+ raise FileSizeLimitError(
177
+ message=message, internal_message=message, failure_type=FailureType.config_error
178
+ )
179
+
180
+ file_paths = self._get_file_transfer_paths(
181
+ source_file_relative_path=file.source_file_relative_path,
182
+ staging_directory=local_directory,
183
+ )
184
+ local_file_path = file_paths[self.LOCAL_FILE_PATH]
185
+ file_relative_path = file_paths[self.FILE_RELATIVE_PATH]
186
+ file_name = file_paths[self.FILE_NAME]
187
+
188
+ logger.info(
189
+ f"Starting to download the file {file.file_uri_for_logging} with size: {file_size / (1024 * 1024):,.2f} MB ({file_size / (1024 * 1024 * 1024):.2f} GB)"
190
+ )
191
+ start_download_time = time.time()
192
+
193
+ file.download_to_local_directory(local_file_path)
194
+
195
+ write_duration = time.time() - start_download_time
196
+ logger.info(
197
+ f"Finished downloading the file {file.file_uri_for_logging} and saved to {local_file_path} in {write_duration:,.2f} seconds."
198
+ )
199
+
200
+ file_record_data = FileRecordData(
201
+ folder=file_paths[self.FILE_FOLDER],
202
+ file_name=file_name,
203
+ bytes=file_size,
204
+ id=file.id,
205
+ mime_type=file.mime_type,
206
+ created_at=file.created_at,
207
+ updated_at=file.updated_at,
208
+ source_uri=file.uri,
209
+ )
210
+ file_reference = AirbyteRecordMessageFileReference(
211
+ staging_file_url=local_file_path,
212
+ source_file_relative_path=file_relative_path,
213
+ file_size_bytes=file_size,
214
+ )
215
+ return file_record_data, file_reference
177
216
 
178
217
  def _get_file_transfer_paths(
179
218
  self, source_file_relative_path: str, staging_directory: str
@@ -7,7 +7,7 @@ from typing import Iterable, Tuple
7
7
  from airbyte_cdk.models import AirbyteRecordMessageFileReference
8
8
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
9
9
  from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
10
- from airbyte_cdk.sources.file_based.remote_file import RemoteFile
10
+ from airbyte_cdk.sources.file_based.remote_file import UploadableRemoteFile
11
11
  from airbyte_cdk.sources.utils.files_directory import get_files_directory
12
12
 
13
13
 
@@ -17,7 +17,7 @@ class FileTransfer:
17
17
 
18
18
  def upload(
19
19
  self,
20
- file: RemoteFile,
20
+ file: UploadableRemoteFile,
21
21
  stream_reader: AbstractFileBasedStreamReader,
22
22
  logger: logging.Logger,
23
23
  ) -> Iterable[Tuple[FileRecordData, AirbyteRecordMessageFileReference]]:
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
-
4
+ from abc import ABC, abstractmethod
5
5
  from datetime import datetime
6
6
  from typing import Optional
7
7
 
@@ -16,3 +16,42 @@ class RemoteFile(BaseModel):
16
16
  uri: str
17
17
  last_modified: datetime
18
18
  mime_type: Optional[str] = None
19
+
20
+
21
+ class UploadableRemoteFile(RemoteFile, ABC):
22
+ """
23
+ A file in a file-based stream that supports uploading(file transferring).
24
+ """
25
+
26
+ id: Optional[str] = None
27
+ created_at: Optional[str] = None
28
+ updated_at: Optional[str] = None
29
+
30
+ @property
31
+ @abstractmethod
32
+ def size(self) -> int:
33
+ """
34
+ Returns the file size in bytes.
35
+ """
36
+ ...
37
+
38
+ @abstractmethod
39
+ def download_to_local_directory(self, local_file_path: str) -> None:
40
+ """
41
+ Download the file from remote source to local storage.
42
+ """
43
+ ...
44
+
45
+ @property
46
+ def source_file_relative_path(self) -> str:
47
+ """
48
+ Returns the relative path of the source file.
49
+ """
50
+ return self.uri
51
+
52
+ @property
53
+ def file_uri_for_logging(self) -> str:
54
+ """
55
+ Returns the URI for the file being logged.
56
+ """
57
+ return self.uri
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-cdk
3
- Version: 7.3.0
3
+ Version: 7.3.1
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  Home-page: https://airbyte.com
6
6
  License: MIT
@@ -298,18 +298,18 @@ airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha2
298
298
  airbyte_cdk/sources/file_based/exceptions.py,sha256=WP0qkG6fpWoBpOyyicgp5YNE393VWyegq5qSy0v4QtM,7362
299
299
  airbyte_cdk/sources/file_based/file_based_source.py,sha256=Xg8OYWnGc-OcVBglvS08uwAWGWHBhEqsBnyODIkOK-4,20051
300
300
  airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py,sha256=4e7FXqQ9hueacexC0SyrZyjF8oREYHza8pKF9CgKbD8,5050
301
- airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=rwz8AhEIqYB9gBF7uW9eR--eUiHOntzuwLH8jFHNacE,7854
301
+ airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=Yg9KRXpyAtElBrUOO8oX4WHQH6k6Lk7keklrZmB5Klg,9614
302
302
  airbyte_cdk/sources/file_based/file_record_data.py,sha256=Vkr5AyZzlsOezjVCLhFrm_WpymlQdolWCnFAwqLJ9Iw,453
303
303
  airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=blCLn0-2LC-ZdgcNyDEhqM2RiUvEjEBh-G4-t32ZtuM,1268
304
304
  airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=USEYqiICXBWpDV443VtNOCmUA-GINzY_Zah74_5w3qQ,10860
305
305
  airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=QlCXB-ry3np67Q_VerQEPoWDOTcPTB6Go4ydZxY9ae4,20445
306
306
  airbyte_cdk/sources/file_based/file_types/excel_parser.py,sha256=BeplCq0hmojELU6bZCvvpRLpQ9us81TqbGYwrhd3INo,7188
307
- airbyte_cdk/sources/file_based/file_types/file_transfer.py,sha256=5l2Jo6bp6neDmgM427PrZMZeqU0hCIZVWnzUZ_7BT10,1100
307
+ airbyte_cdk/sources/file_based/file_types/file_transfer.py,sha256=rFxWaqItBux9tPf4xU03LT6b-wDZf1QolM92mP8Diuk,1120
308
308
  airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=JgpH21PrbRqwK92BJklZWvh2TndA6xZ-eP1LPMo44oQ,2832
309
309
  airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=GwyNyxmST4RX-XpXy7xVH0D-znYWWBmGv_pVAu95oHQ,5886
310
310
  airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=XenFg5sJ-UBnIkSmsiNJRou11NO0zZXx-RXgPHMT2NA,10487
311
311
  airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=2TYOQl62FQPCa8otLbkDIk_j01EP3oWaKSfXGhCjCHg,19492
312
- airbyte_cdk/sources/file_based/remote_file.py,sha256=yqRz93vPe8PBXLIMJ5W5u2JRlZRhg6sBrAjn3pPjJ8A,315
312
+ airbyte_cdk/sources/file_based/remote_file.py,sha256=1Afzr2WFWwjiUz8R2vNFepeI192UNeHOZAXIGTWOzOM,1248
313
313
  airbyte_cdk/sources/file_based/schema_helpers.py,sha256=dKXAOTmMI3YmC5u7PeHC9AaZmlL6ft7CYSFQKCg0sXw,9911
314
314
  airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=FkByIyEy56x2_awYnxGPqGaOp7zAzpAoRkPZHKySI9M,536
315
315
  airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=kjvX7nOmUALYd7HuZHilUzgJPZ-MnZ08mtvuBnt2tQ0,618
@@ -457,9 +457,9 @@ airbyte_cdk/utils/slice_hasher.py,sha256=EDxgROHDbfG-QKQb59m7h_7crN1tRiawdf5uU7G
457
457
  airbyte_cdk/utils/spec_schema_transformations.py,sha256=9YDJmnIGFsT51CVQf2tSSvTapGimITjEFGbUTSZAGTI,963
458
458
  airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
459
459
  airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
460
- airbyte_cdk-7.3.0.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
461
- airbyte_cdk-7.3.0.dist-info/LICENSE_SHORT,sha256=aqF6D1NcESmpn-cqsxBtszTEnHKnlsp8L4x9wAh3Nxg,55
462
- airbyte_cdk-7.3.0.dist-info/METADATA,sha256=KA6giadtbEGz9PBTe3tCmAha99is-QSuz0YULFMoxMU,6798
463
- airbyte_cdk-7.3.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
464
- airbyte_cdk-7.3.0.dist-info/entry_points.txt,sha256=eLZ2UYvJZGm1s07Pplcs--1Gim60YhZWTb53j_dghwU,195
465
- airbyte_cdk-7.3.0.dist-info/RECORD,,
460
+ airbyte_cdk-7.3.1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
461
+ airbyte_cdk-7.3.1.dist-info/LICENSE_SHORT,sha256=aqF6D1NcESmpn-cqsxBtszTEnHKnlsp8L4x9wAh3Nxg,55
462
+ airbyte_cdk-7.3.1.dist-info/METADATA,sha256=_n29oKSyO6A6mUMN1c6YqHvrJRFQrXOIhhAh1E0PuXo,6798
463
+ airbyte_cdk-7.3.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
464
+ airbyte_cdk-7.3.1.dist-info/entry_points.txt,sha256=eLZ2UYvJZGm1s07Pplcs--1Gim60YhZWTb53j_dghwU,195
465
+ airbyte_cdk-7.3.1.dist-info/RECORD,,