airbyte-cdk 7.3.0__py3-none-any.whl → 7.3.1__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +54 -15
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +2 -2
- airbyte_cdk/sources/file_based/remote_file.py +40 -1
- {airbyte_cdk-7.3.0.dist-info → airbyte_cdk-7.3.1.dist-info}/METADATA +1 -1
- {airbyte_cdk-7.3.0.dist-info → airbyte_cdk-7.3.1.dist-info}/RECORD +9 -9
- {airbyte_cdk-7.3.0.dist-info → airbyte_cdk-7.3.1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-7.3.0.dist-info → airbyte_cdk-7.3.1.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-7.3.0.dist-info → airbyte_cdk-7.3.1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-7.3.0.dist-info → airbyte_cdk-7.3.1.dist-info}/entry_points.txt +0 -0
@@ -3,13 +3,15 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
import logging
|
6
|
+
import time
|
6
7
|
from abc import ABC, abstractmethod
|
7
8
|
from datetime import datetime
|
8
9
|
from enum import Enum
|
9
10
|
from io import IOBase
|
10
11
|
from os import makedirs, path
|
11
|
-
from typing import Any,
|
12
|
+
from typing import Any, Iterable, List, MutableMapping, Optional, Set, Tuple
|
12
13
|
|
14
|
+
from airbyte_protocol_dataclasses.models import FailureType
|
13
15
|
from wcmatch.glob import GLOBSTAR, globmatch
|
14
16
|
|
15
17
|
from airbyte_cdk.models import AirbyteRecordMessageFileReference
|
@@ -19,8 +21,9 @@ from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import
|
|
19
21
|
preserve_directory_structure,
|
20
22
|
use_file_transfer,
|
21
23
|
)
|
24
|
+
from airbyte_cdk.sources.file_based.exceptions import FileSizeLimitError
|
22
25
|
from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
|
23
|
-
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
26
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile, UploadableRemoteFile
|
24
27
|
|
25
28
|
|
26
29
|
class FileReadMode(Enum):
|
@@ -34,6 +37,7 @@ class AbstractFileBasedStreamReader(ABC):
|
|
34
37
|
FILE_NAME = "file_name"
|
35
38
|
LOCAL_FILE_PATH = "local_file_path"
|
36
39
|
FILE_FOLDER = "file_folder"
|
40
|
+
FILE_SIZE_LIMIT = 1_500_000_000
|
37
41
|
|
38
42
|
def __init__(self) -> None:
|
39
43
|
self._config = None
|
@@ -113,16 +117,6 @@ class AbstractFileBasedStreamReader(ABC):
|
|
113
117
|
seen.add(file.uri)
|
114
118
|
yield file
|
115
119
|
|
116
|
-
@abstractmethod
|
117
|
-
def file_size(self, file: RemoteFile) -> int:
|
118
|
-
"""Utility method to get size of the remote file.
|
119
|
-
|
120
|
-
This is required for connectors that will support writing to
|
121
|
-
files. If the connector does not support writing files, then the
|
122
|
-
subclass can simply `return 0`.
|
123
|
-
"""
|
124
|
-
...
|
125
|
-
|
126
120
|
@staticmethod
|
127
121
|
def file_matches_globs(file: RemoteFile, globs: List[str]) -> bool:
|
128
122
|
# Use the GLOBSTAR flag to enable recursive ** matching
|
@@ -153,9 +147,8 @@ class AbstractFileBasedStreamReader(ABC):
|
|
153
147
|
return include_identities_stream(self.config)
|
154
148
|
return False
|
155
149
|
|
156
|
-
@abstractmethod
|
157
150
|
def upload(
|
158
|
-
self, file: RemoteFile, local_directory: str, logger: logging.Logger
|
151
|
+
self, file: UploadableRemoteFile, local_directory: str, logger: logging.Logger
|
159
152
|
) -> Tuple[FileRecordData, AirbyteRecordMessageFileReference]:
|
160
153
|
"""
|
161
154
|
This is required for connectors that will support writing to
|
@@ -173,7 +166,53 @@ class AbstractFileBasedStreamReader(ABC):
|
|
173
166
|
- file_size_bytes (int): The size of the referenced file in bytes.
|
174
167
|
- source_file_relative_path (str): The relative path to the referenced file in source.
|
175
168
|
"""
|
176
|
-
|
169
|
+
if not isinstance(file, UploadableRemoteFile):
|
170
|
+
raise TypeError(f"Expected UploadableRemoteFile, got {type(file)}")
|
171
|
+
|
172
|
+
file_size = file.size
|
173
|
+
|
174
|
+
if file_size > self.FILE_SIZE_LIMIT:
|
175
|
+
message = f"File size exceeds the {self.FILE_SIZE_LIMIT / 1e9} GB limit."
|
176
|
+
raise FileSizeLimitError(
|
177
|
+
message=message, internal_message=message, failure_type=FailureType.config_error
|
178
|
+
)
|
179
|
+
|
180
|
+
file_paths = self._get_file_transfer_paths(
|
181
|
+
source_file_relative_path=file.source_file_relative_path,
|
182
|
+
staging_directory=local_directory,
|
183
|
+
)
|
184
|
+
local_file_path = file_paths[self.LOCAL_FILE_PATH]
|
185
|
+
file_relative_path = file_paths[self.FILE_RELATIVE_PATH]
|
186
|
+
file_name = file_paths[self.FILE_NAME]
|
187
|
+
|
188
|
+
logger.info(
|
189
|
+
f"Starting to download the file {file.file_uri_for_logging} with size: {file_size / (1024 * 1024):,.2f} MB ({file_size / (1024 * 1024 * 1024):.2f} GB)"
|
190
|
+
)
|
191
|
+
start_download_time = time.time()
|
192
|
+
|
193
|
+
file.download_to_local_directory(local_file_path)
|
194
|
+
|
195
|
+
write_duration = time.time() - start_download_time
|
196
|
+
logger.info(
|
197
|
+
f"Finished downloading the file {file.file_uri_for_logging} and saved to {local_file_path} in {write_duration:,.2f} seconds."
|
198
|
+
)
|
199
|
+
|
200
|
+
file_record_data = FileRecordData(
|
201
|
+
folder=file_paths[self.FILE_FOLDER],
|
202
|
+
file_name=file_name,
|
203
|
+
bytes=file_size,
|
204
|
+
id=file.id,
|
205
|
+
mime_type=file.mime_type,
|
206
|
+
created_at=file.created_at,
|
207
|
+
updated_at=file.updated_at,
|
208
|
+
source_uri=file.uri,
|
209
|
+
)
|
210
|
+
file_reference = AirbyteRecordMessageFileReference(
|
211
|
+
staging_file_url=local_file_path,
|
212
|
+
source_file_relative_path=file_relative_path,
|
213
|
+
file_size_bytes=file_size,
|
214
|
+
)
|
215
|
+
return file_record_data, file_reference
|
177
216
|
|
178
217
|
def _get_file_transfer_paths(
|
179
218
|
self, source_file_relative_path: str, staging_directory: str
|
@@ -7,7 +7,7 @@ from typing import Iterable, Tuple
|
|
7
7
|
from airbyte_cdk.models import AirbyteRecordMessageFileReference
|
8
8
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
9
9
|
from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
|
10
|
-
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
10
|
+
from airbyte_cdk.sources.file_based.remote_file import UploadableRemoteFile
|
11
11
|
from airbyte_cdk.sources.utils.files_directory import get_files_directory
|
12
12
|
|
13
13
|
|
@@ -17,7 +17,7 @@ class FileTransfer:
|
|
17
17
|
|
18
18
|
def upload(
|
19
19
|
self,
|
20
|
-
file: RemoteFile,
|
20
|
+
file: UploadableRemoteFile,
|
21
21
|
stream_reader: AbstractFileBasedStreamReader,
|
22
22
|
logger: logging.Logger,
|
23
23
|
) -> Iterable[Tuple[FileRecordData, AirbyteRecordMessageFileReference]]:
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
|
-
|
4
|
+
from abc import ABC, abstractmethod
|
5
5
|
from datetime import datetime
|
6
6
|
from typing import Optional
|
7
7
|
|
@@ -16,3 +16,42 @@ class RemoteFile(BaseModel):
|
|
16
16
|
uri: str
|
17
17
|
last_modified: datetime
|
18
18
|
mime_type: Optional[str] = None
|
19
|
+
|
20
|
+
|
21
|
+
class UploadableRemoteFile(RemoteFile, ABC):
|
22
|
+
"""
|
23
|
+
A file in a file-based stream that supports uploading(file transferring).
|
24
|
+
"""
|
25
|
+
|
26
|
+
id: Optional[str] = None
|
27
|
+
created_at: Optional[str] = None
|
28
|
+
updated_at: Optional[str] = None
|
29
|
+
|
30
|
+
@property
|
31
|
+
@abstractmethod
|
32
|
+
def size(self) -> int:
|
33
|
+
"""
|
34
|
+
Returns the file size in bytes.
|
35
|
+
"""
|
36
|
+
...
|
37
|
+
|
38
|
+
@abstractmethod
|
39
|
+
def download_to_local_directory(self, local_file_path: str) -> None:
|
40
|
+
"""
|
41
|
+
Download the file from remote source to local storage.
|
42
|
+
"""
|
43
|
+
...
|
44
|
+
|
45
|
+
@property
|
46
|
+
def source_file_relative_path(self) -> str:
|
47
|
+
"""
|
48
|
+
Returns the relative path of the source file.
|
49
|
+
"""
|
50
|
+
return self.uri
|
51
|
+
|
52
|
+
@property
|
53
|
+
def file_uri_for_logging(self) -> str:
|
54
|
+
"""
|
55
|
+
Returns the URI for the file being logged.
|
56
|
+
"""
|
57
|
+
return self.uri
|
@@ -298,18 +298,18 @@ airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha2
|
|
298
298
|
airbyte_cdk/sources/file_based/exceptions.py,sha256=WP0qkG6fpWoBpOyyicgp5YNE393VWyegq5qSy0v4QtM,7362
|
299
299
|
airbyte_cdk/sources/file_based/file_based_source.py,sha256=Xg8OYWnGc-OcVBglvS08uwAWGWHBhEqsBnyODIkOK-4,20051
|
300
300
|
airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py,sha256=4e7FXqQ9hueacexC0SyrZyjF8oREYHza8pKF9CgKbD8,5050
|
301
|
-
airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=
|
301
|
+
airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=Yg9KRXpyAtElBrUOO8oX4WHQH6k6Lk7keklrZmB5Klg,9614
|
302
302
|
airbyte_cdk/sources/file_based/file_record_data.py,sha256=Vkr5AyZzlsOezjVCLhFrm_WpymlQdolWCnFAwqLJ9Iw,453
|
303
303
|
airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=blCLn0-2LC-ZdgcNyDEhqM2RiUvEjEBh-G4-t32ZtuM,1268
|
304
304
|
airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=USEYqiICXBWpDV443VtNOCmUA-GINzY_Zah74_5w3qQ,10860
|
305
305
|
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=QlCXB-ry3np67Q_VerQEPoWDOTcPTB6Go4ydZxY9ae4,20445
|
306
306
|
airbyte_cdk/sources/file_based/file_types/excel_parser.py,sha256=BeplCq0hmojELU6bZCvvpRLpQ9us81TqbGYwrhd3INo,7188
|
307
|
-
airbyte_cdk/sources/file_based/file_types/file_transfer.py,sha256=
|
307
|
+
airbyte_cdk/sources/file_based/file_types/file_transfer.py,sha256=rFxWaqItBux9tPf4xU03LT6b-wDZf1QolM92mP8Diuk,1120
|
308
308
|
airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=JgpH21PrbRqwK92BJklZWvh2TndA6xZ-eP1LPMo44oQ,2832
|
309
309
|
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=GwyNyxmST4RX-XpXy7xVH0D-znYWWBmGv_pVAu95oHQ,5886
|
310
310
|
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=XenFg5sJ-UBnIkSmsiNJRou11NO0zZXx-RXgPHMT2NA,10487
|
311
311
|
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=2TYOQl62FQPCa8otLbkDIk_j01EP3oWaKSfXGhCjCHg,19492
|
312
|
-
airbyte_cdk/sources/file_based/remote_file.py,sha256=
|
312
|
+
airbyte_cdk/sources/file_based/remote_file.py,sha256=1Afzr2WFWwjiUz8R2vNFepeI192UNeHOZAXIGTWOzOM,1248
|
313
313
|
airbyte_cdk/sources/file_based/schema_helpers.py,sha256=dKXAOTmMI3YmC5u7PeHC9AaZmlL6ft7CYSFQKCg0sXw,9911
|
314
314
|
airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=FkByIyEy56x2_awYnxGPqGaOp7zAzpAoRkPZHKySI9M,536
|
315
315
|
airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=kjvX7nOmUALYd7HuZHilUzgJPZ-MnZ08mtvuBnt2tQ0,618
|
@@ -457,9 +457,9 @@ airbyte_cdk/utils/slice_hasher.py,sha256=EDxgROHDbfG-QKQb59m7h_7crN1tRiawdf5uU7G
|
|
457
457
|
airbyte_cdk/utils/spec_schema_transformations.py,sha256=9YDJmnIGFsT51CVQf2tSSvTapGimITjEFGbUTSZAGTI,963
|
458
458
|
airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
|
459
459
|
airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
|
460
|
-
airbyte_cdk-7.3.0.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
461
|
-
airbyte_cdk-7.3.0.dist-info/LICENSE_SHORT,sha256=aqF6D1NcESmpn-cqsxBtszTEnHKnlsp8L4x9wAh3Nxg,55
|
462
|
-
airbyte_cdk-7.3.
|
463
|
-
airbyte_cdk-7.3.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
464
|
-
airbyte_cdk-7.3.0.dist-info/entry_points.txt,sha256=eLZ2UYvJZGm1s07Pplcs--1Gim60YhZWTb53j_dghwU,195
|
465
|
-
airbyte_cdk-7.3.0.dist-info/RECORD,,
|
460
|
+
airbyte_cdk-7.3.1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
461
|
+
airbyte_cdk-7.3.1.dist-info/LICENSE_SHORT,sha256=aqF6D1NcESmpn-cqsxBtszTEnHKnlsp8L4x9wAh3Nxg,55
|
462
|
+
airbyte_cdk-7.3.1.dist-info/METADATA,sha256=_n29oKSyO6A6mUMN1c6YqHvrJRFQrXOIhhAh1E0PuXo,6798
|
463
|
+
airbyte_cdk-7.3.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
464
|
+
airbyte_cdk-7.3.1.dist-info/entry_points.txt,sha256=eLZ2UYvJZGm1s07Pplcs--1Gim60YhZWTb53j_dghwU,195
|
465
|
+
airbyte_cdk-7.3.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|