airbyte-cdk 6.18.0.dev3__py3-none-any.whl → 6.18.2__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. The information is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
- airbyte_cdk/sources/declarative/auth/oauth.py +26 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +52 -36
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +0 -43
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +44 -20
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +50 -58
- airbyte_cdk/sources/declarative/requesters/README.md +57 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +33 -4
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +11 -0
- airbyte_cdk/sources/file_based/exceptions.py +34 -0
- airbyte_cdk/sources/file_based/file_based_source.py +28 -5
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +18 -4
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +25 -2
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +30 -2
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +20 -4
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +34 -4
- airbyte_cdk/sources/types.py +3 -0
- {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/RECORD +21 -20
- {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py:

```diff
@@ -72,8 +72,6 @@ from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
     CsvParser,
     GzipParser,
     JsonLineParser,
-    JsonParser,
-    Parser,
 )
 from airbyte_cdk.sources.declarative.extractors import (
     DpathExtractor,
@@ -249,9 +247,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JsonLineParser as JsonLineParserModel,
 )
-from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    JsonParser as JsonParserModel,
-)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JwtAuthenticator as JwtAuthenticatorModel,
 )
@@ -527,7 +522,6 @@ class ModelToComponentFactory:
             JsonDecoderModel: self.create_json_decoder,
             JsonlDecoderModel: self.create_jsonl_decoder,
             JsonLineParserModel: self.create_json_line_parser,
-            JsonParserModel: self.create_json_parser,
             GzipJsonDecoderModel: self.create_gzipjson_decoder,
             GzipParserModel: self.create_gzip_parser,
             KeysToLowerModel: self.create_keys_to_lower_transformation,
```
```diff
@@ -1038,17 +1032,17 @@ class ModelToComponentFactory:
         self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any
     ) -> CursorPaginationStrategy:
         if isinstance(decoder, PaginationDecoderDecorator):
-
-
-
-
-
-            if self._is_supported_decoder_for_pagination(inner_decoder):
+            if not isinstance(decoder.decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder.decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
             decoder_to_use = decoder
         else:
-
-
-
+            if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
+            decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
 
         return CursorPaginationStrategy(
             cursor_value=model.cursor_value,
@@ -1521,10 +1515,11 @@ class ModelToComponentFactory:
         cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None,
     ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]:
         if decoder:
-            if
-
-
-
+            if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
+            decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
         else:
             decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
         page_size_option = (
@@ -1753,11 +1748,6 @@ class ModelToComponentFactory:
     def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> JsonDecoder:
         return JsonDecoder(parameters={})
 
-    @staticmethod
-    def create_json_parser(model: JsonParserModel, config: Config, **kwargs: Any) -> JsonParser:
-        encoding = model.encoding if model.encoding else "utf-8"
-        return JsonParser(encoding=encoding)
-
     @staticmethod
     def create_jsonl_decoder(
         model: JsonlDecoderModel, config: Config, **kwargs: Any
```
```diff
@@ -1895,15 +1885,24 @@ class ModelToComponentFactory:
                 expires_in_name=InterpolatedString.create(
                     model.expires_in_name or "expires_in", parameters=model.parameters or {}
                 ).eval(config),
+                client_id_name=InterpolatedString.create(
+                    model.client_id_name or "client_id", parameters=model.parameters or {}
+                ).eval(config),
                 client_id=InterpolatedString.create(
                     model.client_id, parameters=model.parameters or {}
                 ).eval(config),
+                client_secret_name=InterpolatedString.create(
+                    model.client_secret_name or "client_secret", parameters=model.parameters or {}
+                ).eval(config),
                 client_secret=InterpolatedString.create(
                     model.client_secret, parameters=model.parameters or {}
                 ).eval(config),
                 access_token_config_path=model.refresh_token_updater.access_token_config_path,
                 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path,
                 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path,
+                grant_type_name=InterpolatedString.create(
+                    model.grant_type_name or "grant_type", parameters=model.parameters or {}
+                ).eval(config),
                 grant_type=InterpolatedString.create(
                     model.grant_type or "refresh_token", parameters=model.parameters or {}
                 ).eval(config),
@@ -1921,11 +1920,15 @@ class ModelToComponentFactory:
         return DeclarativeOauth2Authenticator(  # type: ignore
             access_token_name=model.access_token_name or "access_token",
             access_token_value=model.access_token_value,
+            client_id_name=model.client_id_name or "client_id",
             client_id=model.client_id,
+            client_secret_name=model.client_secret_name or "client_secret",
             client_secret=model.client_secret,
             expires_in_name=model.expires_in_name or "expires_in",
+            grant_type_name=model.grant_type_name or "grant_type",
             grant_type=model.grant_type or "refresh_token",
             refresh_request_body=model.refresh_request_body,
+            refresh_token_name=model.refresh_token_name or "refresh_token",
             refresh_token=model.refresh_token,
             scopes=model.scopes,
             token_expiry_date=model.token_expiry_date,
```
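Both OAuth hunks thread new `*_name` overrides (`client_id_name`, `client_secret_name`, `grant_type_name`, `refresh_token_name`) through to the declarative authenticators, letting a connector rename the fields it sends when refreshing a token. A rough before/after sketch of a token refresh request body, assuming an API that expects non-standard key names (all values are placeholders):

```python
# Default field names used when no *_name overrides are set.
default_body = {
    "grant_type": "refresh_token",
    "client_id": "<client id>",
    "client_secret": "<client secret>",
    "refresh_token": "<refresh token>",
}

# Hypothetical API expecting different key names. With
# client_id_name="app_id", client_secret_name="app_secret",
# grant_type_name="grantType", refresh_token_name="refreshToken",
# the same request body would instead be sent as:
overridden_body = {
    "grantType": "refresh_token",
    "app_id": "<client id>",
    "app_secret": "<client secret>",
    "refreshToken": "<refresh token>",
}
```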
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py (continued):

```diff
@@ -1937,22 +1940,22 @@ class ModelToComponentFactory:
             message_repository=self._message_repository,
         )
 
+    @staticmethod
     def create_offset_increment(
-
+        model: OffsetIncrementModel, config: Config, decoder: Decoder, **kwargs: Any
     ) -> OffsetIncrement:
         if isinstance(decoder, PaginationDecoderDecorator):
-
-
-
-
-
-            if self._is_supported_decoder_for_pagination(inner_decoder):
+            if not isinstance(decoder.decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder.decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
             decoder_to_use = decoder
         else:
-
-
-
-
+            if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
+            decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
         return OffsetIncrement(
             page_size=model.page_size,
             config=config,
@@ -2297,7 +2300,7 @@ class ModelToComponentFactory:
             extractor=download_extractor,
             name=name,
             record_filter=None,
-            transformations=
+            transformations=transformations,
             schema_normalization=TypeTransformer(TransformConfig.NoTransform),
             config=config,
             parameters={},
@@ -2334,6 +2337,16 @@ class ModelToComponentFactory:
             if model.delete_requester
             else None
         )
+        url_requester = (
+            self._create_component_from_model(
+                model=model.url_requester,
+                decoder=decoder,
+                config=config,
+                name=f"job extract_url - {name}",
+            )
+            if model.url_requester
+            else None
+        )
         status_extractor = self._create_component_from_model(
             model=model.status_extractor, decoder=decoder, config=config, name=name
         )
@@ -2344,6 +2357,7 @@ class ModelToComponentFactory:
             creation_requester=creation_requester,
             polling_requester=polling_requester,
             download_retriever=download_retriever,
+            url_requester=url_requester,
             abort_requester=abort_requester,
             delete_requester=delete_requester,
             status_extractor=status_extractor,
@@ -2541,25 +2555,3 @@ class ModelToComponentFactory:
             components_mapping=components_mapping,
             parameters=model.parameters or {},
         )
-
-    _UNSUPPORTED_DECODER_ERROR = (
-        "Specified decoder of {decoder_type} is not supported for pagination."
-        "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead."
-        "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`."
-    )
-
-    def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool:
-        if isinstance(decoder, (JsonDecoder, XmlDecoder)):
-            return True
-        elif isinstance(decoder, CompositeRawDecoder):
-            return self._is_supported_parser_for_pagination(decoder.parser)
-        else:
-            return False
-
-    def _is_supported_parser_for_pagination(self, parser: Parser) -> bool:
-        if isinstance(parser, JsonParser):
-            return True
-        elif isinstance(parser, GzipParser):
-            return isinstance(parser.inner_parser, JsonParser)
-        else:
-            return False
```
airbyte_cdk/sources/declarative/requesters/README.md (new file):

````diff
@@ -0,0 +1,57 @@
+# AsyncHttpJobRepository sequence diagram
+
+- Components marked as optional are not required and can be ignored.
+- if `url_requester` is not provided, `urls_extractor` will get urls from the `polling_job_response`
+- interpolation_context, e.g. `create_job_response` or `polling_job_response` can be obtained from stream_slice
+
+
+```mermaid
+---
+title: AsyncHttpJobRepository Sequence Diagram
+---
+sequenceDiagram
+    participant AsyncHttpJobRepository as AsyncOrchestrator
+    participant CreationRequester as creation_requester
+    participant PollingRequester as polling_requester
+    participant UrlRequester as url_requester (Optional)
+    participant DownloadRetriever as download_retriever
+    participant AbortRequester as abort_requester (Optional)
+    participant DeleteRequester as delete_requester (Optional)
+    participant Reporting Server as Async Reporting Server
+
+    AsyncHttpJobRepository ->> CreationRequester: Initiate job creation
+    CreationRequester ->> Reporting Server: Create job request
+    Reporting Server -->> CreationRequester: Job ID response
+    CreationRequester -->> AsyncHttpJobRepository: Job ID
+
+    loop Poll for job status
+        AsyncHttpJobRepository ->> PollingRequester: Check job status
+        PollingRequester ->> Reporting Server: Status request (interpolation_context: `create_job_response`)
+        Reporting Server -->> PollingRequester: Status response
+        PollingRequester -->> AsyncHttpJobRepository: Job status
+    end
+
+    alt Status: Ready
+        AsyncHttpJobRepository ->> UrlRequester: Request download URLs (if applicable)
+        UrlRequester ->> Reporting Server: URL request (interpolation_context: `polling_job_response`)
+        Reporting Server -->> UrlRequester: Download URLs
+        UrlRequester -->> AsyncHttpJobRepository: Download URLs
+
+        AsyncHttpJobRepository ->> DownloadRetriever: Download reports
+        DownloadRetriever ->> Reporting Server: Retrieve report data (interpolation_context: `url`)
+        Reporting Server -->> DownloadRetriever: Report data
+        DownloadRetriever -->> AsyncHttpJobRepository: Report data
+    else Status: Failed
+        AsyncHttpJobRepository ->> AbortRequester: Send abort request
+        AbortRequester ->> Reporting Server: Abort job
+        Reporting Server -->> AbortRequester: Abort confirmation
+        AbortRequester -->> AsyncHttpJobRepository: Confirmation
+    end
+
+    AsyncHttpJobRepository ->> DeleteRequester: Send delete job request
+    DeleteRequester ->> Reporting Server: Delete job
+    Reporting Server -->> DeleteRequester: Deletion confirmation
+    DeleteRequester -->> AsyncHttpJobRepository: Confirmation
+
+
+```
````
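Read top to bottom, the new README describes a create → poll → resolve URLs → download → clean up sequence. The following is a minimal, self-contained sketch of that ordering with stand-in functions; the payload shapes, URL, and helper names are invented for illustration, and only the step order and the `url_requester` fallback rule come from the README above:

```python
# Stand-in functions that mimic the requesters in the sequence diagram.
def create_job():
    return {"job_id": "123"}                        # plays the role of create_job_response

def poll_job(create_job_response):
    return {"status": "ready", "result_id": "r-1"}  # plays the role of polling_job_response

def request_urls(polling_job_response):              # optional url_requester step
    return ["https://example.com/report/r-1.csv"]    # placeholder URL

def download(url):
    return [{"row": 1, "source": url}]               # records from the download step


create_job_response = create_job()
polling_job_response = poll_job(create_job_response)

if polling_job_response["status"] == "ready":
    # With a url_requester configured, URLs come from a dedicated request that can
    # interpolate polling_job_response; without one, urls_extractor reads the URLs
    # directly from the polling response.
    urls = request_urls(polling_job_response)
    records = [record for url in urls for record in download(url)]
    print(records)
else:
    # Status: Failed -> abort_requester (optional), then delete_requester (optional).
    pass
```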
airbyte_cdk/sources/declarative/requesters/http_job_repository.py:

```diff
@@ -31,6 +31,10 @@ LOGGER = logging.getLogger("airbyte")
 
 @dataclass
 class AsyncHttpJobRepository(AsyncJobRepository):
+    """
+    See Readme file for more details about flow.
+    """
+
     creation_requester: Requester
     polling_requester: Requester
     download_retriever: SimpleRetriever
@@ -44,6 +48,9 @@ class AsyncHttpJobRepository(AsyncJobRepository):
     record_extractor: RecordExtractor = field(
         init=False, repr=False, default_factory=lambda: ResponseToFileExtractor({})
     )
+    url_requester: Optional[Requester] = (
+        None  # use it in case polling_requester provides some <id> and extra request is needed to obtain list of urls to download from
+    )
 
     def __post_init__(self) -> None:
         self._create_job_response_by_id: Dict[str, Response] = {}
@@ -186,10 +193,13 @@ class AsyncHttpJobRepository(AsyncJobRepository):
 
         """
 
-        for url in self.
-
-
-
+        for url in self._get_download_url(job):
+            job_slice = job.job_parameters()
+            stream_slice = StreamSlice(
+                partition=job_slice.partition,
+                cursor_slice=job_slice.cursor_slice,
+                extra_fields={**job_slice.extra_fields, "url": url},
+            )
             for message in self.download_retriever.read_records({}, stream_slice):
                 if isinstance(message, Record):
                     yield message.data
@@ -226,3 +236,22 @@ class AsyncHttpJobRepository(AsyncJobRepository):
             cursor_slice={},
         )
         return stream_slice
+
+    def _get_download_url(self, job: AsyncJob) -> Iterable[str]:
+        if not self.url_requester:
+            url_response = self._polling_job_response_by_id[job.api_job_id()]
+        else:
+            stream_slice: StreamSlice = StreamSlice(
+                partition={
+                    "polling_job_response": self._polling_job_response_by_id[job.api_job_id()]
+                },
+                cursor_slice={},
+            )
+            url_response = self.url_requester.send_request(stream_slice=stream_slice)  # type: ignore # we expect url_requester to always be presented, otherwise raise an exception as we cannot proceed with the report
+        if not url_response:
+            raise AirbyteTracedException(
+                internal_message="Always expect a response or an exception from url_requester",
+                failure_type=FailureType.system_error,
+            )
+
+        yield from self.urls_extractor.extract_records(url_response)  # type: ignore # we expect urls_extractor to always return list of strings
```
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py:

```diff
@@ -31,6 +31,17 @@ class DeliverRawFiles(BaseModel):
 
     delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)
 
+    preserve_directory_structure: bool = Field(
+        title="Preserve Sub-Directories in File Paths",
+        description=(
+            "If enabled, sends subdirectory folder structure "
+            "along with source file names to the destination. "
+            "Otherwise, files will be synced by their names only. "
+            "This option is ignored when file-based replication is not enabled."
+        ),
+        default=True,
+    )
+
 
 class AbstractFileBasedSpec(BaseModel):
     """
```
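The new `preserve_directory_structure` field lives on the `DeliverRawFiles` delivery method and defaults to `True`. Below is a sketch of the relevant fragment of a file-based source config that opts out of preserving sub-directories; only the `delivery_method` keys are grounded in this diff, and the surrounding keys are abbreviated:

```python
# Illustrative config fragment for a file-based source using raw file transfer.
config = {
    # ... bucket/credentials/streams settings omitted ...
    "delivery_method": {
        "delivery_type": "use_file_transfer",      # raw file transfer mode
        "preserve_directory_structure": False,     # flatten files to their basenames
    },
}
```

When the option is disabled, files are synced by name only, which is why the CDK now raises `DuplicatedFilesError` when two remote files share a basename (see the `exceptions.py` and `default_file_based_stream.py` hunks below).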
airbyte_cdk/sources/file_based/exceptions.py:

```diff
@@ -111,6 +111,40 @@ class ErrorListingFiles(BaseFileBasedSourceError):
     pass
 
 
+class DuplicatedFilesError(BaseFileBasedSourceError):
+    def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
+        self._duplicated_files_names = duplicated_files_names
+        self._stream_name: str = kwargs["stream"]
+        super().__init__(self._format_duplicate_files_error_message(), **kwargs)
+
+    def _format_duplicate_files_error_message(self) -> str:
+        duplicated_files_messages = []
+        for duplicated_file in self._duplicated_files_names:
+            for duplicated_file_name, file_paths in duplicated_file.items():
+                file_duplicated_message = (
+                    f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
+                    + "".join(f"\n - {file_paths}")
+                )
+                duplicated_files_messages.append(file_duplicated_message)
+
+        error_message = (
+            f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
+            "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
+            "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
+            + "\n".join(duplicated_files_messages)
+        )
+
+        return error_message
+
+    def __repr__(self) -> str:
+        """Return a string representation of the exception."""
+        class_name = self.__class__.__name__
+        properties_str = ", ".join(
+            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
+        )
+        return f"{class_name}({properties_str})"
+
+
 class CustomFileBasedException(AirbyteTracedException):
     """
     A specialized exception for file-based connectors.
```
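A small sketch of how the new exception is used; the stream name and file paths are invented, but the `stream=` and `duplicated_files_names=` keyword arguments match the raise site shown in the `default_file_based_stream.py` hunk further down:

```python
from airbyte_cdk.sources.file_based.exceptions import DuplicatedFilesError

# Two remote files that collapse to the same basename once directories are dropped.
error = DuplicatedFilesError(
    stream="my_stream",
    duplicated_files_names=[
        {"report.csv": ["2024/01/report.csv", "2024/02/report.csv"]}
    ],
)
# The formatted message begins with:
#   ERROR: Duplicate filenames found for stream my_stream. Duplicate file names are not
#   allowed if the Preserve Sub-Directories in File Paths option is disabled. ...
```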
airbyte_cdk/sources/file_based/file_based_source.py:

```diff
@@ -242,7 +242,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     stream=self._make_default_stream(
                         stream_config=stream_config,
                         cursor=cursor,
-
+                        parsed_config=parsed_config,
                     ),
                     source=self,
                     logger=self.logger,
@@ -273,7 +273,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     stream=self._make_default_stream(
                         stream_config=stream_config,
                         cursor=cursor,
-
+                        parsed_config=parsed_config,
                     ),
                     source=self,
                     logger=self.logger,
@@ -285,7 +285,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                 stream = self._make_default_stream(
                     stream_config=stream_config,
                     cursor=cursor,
-
+                    parsed_config=parsed_config,
                 )
 
             streams.append(stream)
@@ -298,7 +298,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
         self,
         stream_config: FileBasedStreamConfig,
         cursor: Optional[AbstractFileBasedCursor],
-
+        parsed_config: AbstractFileBasedSpec,
     ) -> AbstractFileBasedStream:
         return DefaultFileBasedStream(
             config=stream_config,
@@ -310,7 +310,8 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             validation_policy=self._validate_and_get_validation_policy(stream_config),
             errors_collector=self.errors_collector,
             cursor=cursor,
-            use_file_transfer=
+            use_file_transfer=self._use_file_transfer(parsed_config),
+            preserve_directory_structure=self._preserve_directory_structure(parsed_config),
         )
 
     def _get_stream_from_catalog(
@@ -385,3 +386,25 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             and parsed_config.delivery_method.delivery_type == "use_file_transfer"
         )
         return use_file_transfer
+
+    @staticmethod
+    def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
+        """
+        Determines whether to preserve directory structure during file transfer.
+
+        When enabled, files maintain their subdirectory paths in the destination.
+        When disabled, files are flattened to the root of the destination.
+
+        Args:
+            parsed_config: The parsed configuration containing delivery method settings
+
+        Returns:
+            True if directory structure should be preserved (default), False otherwise
+        """
+        if (
+            FileBasedSource._use_file_transfer(parsed_config)
+            and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
+            and parsed_config.delivery_method.preserve_directory_structure is not None
+        ):
+            return parsed_config.delivery_method.preserve_directory_structure
+        return True
```
airbyte_cdk/sources/file_based/file_based_stream_reader.py:

```diff
@@ -135,6 +135,17 @@ class AbstractFileBasedStreamReader(ABC):
             return use_file_transfer
         return False
 
+    def preserve_directory_structure(self) -> bool:
+        # fall back to preserve subdirectories if config is not present or incomplete
+        if (
+            self.use_file_transfer()
+            and self.config
+            and hasattr(self.config.delivery_method, "preserve_directory_structure")
+            and self.config.delivery_method.preserve_directory_structure is not None
+        ):
+            return self.config.delivery_method.preserve_directory_structure
+        return True
+
     @abstractmethod
     def get_file(
         self, file: RemoteFile, local_directory: str, logger: logging.Logger
@@ -159,10 +170,13 @@ class AbstractFileBasedStreamReader(ABC):
         """
         ...
 
-
-
-
-
+    def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
+        preserve_directory_structure = self.preserve_directory_structure()
+        if preserve_directory_structure:
+            # Remove left slashes from source path format to make relative path for writing locally
+            file_relative_path = file.uri.lstrip("/")
+        else:
+            file_relative_path = path.basename(file.uri)
         local_file_path = path.join(local_directory, file_relative_path)
 
         # Ensure the local directory exists
```
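The effect of the two `_get_file_transfer_paths` branches can be reproduced with plain `os.path` calls; this standalone sketch shows how the same remote file maps to different local paths depending on the option (the URI and staging directory are examples):

```python
from os import path

uri = "/reports/2024/01/report.csv"         # example remote file URI
local_directory = "/tmp/airbyte-downloads"  # example staging directory

# preserve_directory_structure=True (default): keep the sub-directories.
relative = uri.lstrip("/")                   # "reports/2024/01/report.csv"
print(path.join(local_directory, relative))  # .../reports/2024/01/report.csv

# preserve_directory_structure=False: flatten to the basename.
flat = path.basename(uri)                    # "report.csv"
print(path.join(local_directory, flat))      # .../report.csv
# With flattening, "/reports/2024/02/report.csv" would map to the same local file,
# which is why duplicate basenames now raise DuplicatedFilesError upstream.
```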
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py:

```diff
@@ -2,6 +2,7 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 import logging
+import os
 import traceback
 from datetime import datetime
 from io import BytesIO, IOBase
@@ -42,12 +43,34 @@ unstructured_partition_pdf = None
 unstructured_partition_docx = None
 unstructured_partition_pptx = None
 
+AIRBYTE_NLTK_DATA_DIR = "/airbyte/nltk_data"
+TMP_NLTK_DATA_DIR = "/tmp/nltk_data"
+
+
+def get_nltk_temp_folder() -> str:
+    """
+    For non-root connectors /tmp is not currently writable, but we should allow it in the future.
+    It's safe to use /airbyte for now. Fallback to /tmp for local development.
+    """
+    try:
+        nltk_data_dir = AIRBYTE_NLTK_DATA_DIR
+        os.makedirs(nltk_data_dir, exist_ok=True)
+    except OSError:
+        nltk_data_dir = TMP_NLTK_DATA_DIR
+        os.makedirs(nltk_data_dir, exist_ok=True)
+    return nltk_data_dir
+
+
 try:
+    nltk_data_dir = get_nltk_temp_folder()
+    nltk.data.path.append(nltk_data_dir)
     nltk.data.find("tokenizers/punkt.zip")
     nltk.data.find("tokenizers/punkt_tab.zip")
+    nltk.data.find("tokenizers/averaged_perceptron_tagger_eng.zip")
 except LookupError:
-    nltk.download("punkt")
-    nltk.download("punkt_tab")
+    nltk.download("punkt", download_dir=nltk_data_dir, quiet=True)
+    nltk.download("punkt_tab", download_dir=nltk_data_dir, quiet=True)
+    nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir, quiet=True)
 
 
 def optional_decode(contents: Union[str, bytes]) -> str:
```
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py:

```diff
@@ -5,14 +5,17 @@
 import asyncio
 import itertools
 import traceback
+from collections import defaultdict
 from copy import deepcopy
 from functools import cache
-from
+from os import path
+from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union
 
 from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
 from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
 from airbyte_cdk.sources.file_based.exceptions import (
+    DuplicatedFilesError,
     FileBasedSourceError,
     InvalidSchemaError,
     MissingSchemaError,
@@ -43,6 +46,8 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     """
 
     FILE_TRANSFER_KW = "use_file_transfer"
+    PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure"
+    FILES_KEY = "files"
     DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
     ab_last_mod_col = "_ab_source_file_last_modified"
     ab_file_name_col = "_ab_source_file_url"
@@ -50,10 +55,15 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     source_file_url = "source_file_url"
     airbyte_columns = [ab_last_mod_col, ab_file_name_col]
     use_file_transfer = False
+    preserve_directory_structure = True
 
     def __init__(self, **kwargs: Any):
         if self.FILE_TRANSFER_KW in kwargs:
             self.use_file_transfer = kwargs.pop(self.FILE_TRANSFER_KW, False)
+        if self.PRESERVE_DIRECTORY_STRUCTURE_KW in kwargs:
+            self.preserve_directory_structure = kwargs.pop(
+                self.PRESERVE_DIRECTORY_STRUCTURE_KW, True
+            )
         super().__init__(**kwargs)
 
     @property
@@ -98,15 +108,33 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         else:
             return super()._filter_schema_invalid_properties(configured_catalog_json_schema)
 
+    def _duplicated_files_names(
+        self, slices: List[dict[str, List[RemoteFile]]]
+    ) -> List[dict[str, List[str]]]:
+        seen_file_names: Dict[str, List[str]] = defaultdict(list)
+        for file_slice in slices:
+            for file_found in file_slice[self.FILES_KEY]:
+                file_name = path.basename(file_found.uri)
+                seen_file_names[file_name].append(file_found.uri)
+        return [
+            {file_name: paths} for file_name, paths in seen_file_names.items() if len(paths) > 1
+        ]
+
     def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
         # Sort files by last_modified, uri and return them grouped by last_modified
         all_files = self.list_files()
         files_to_read = self._cursor.get_files_to_sync(all_files, self.logger)
         sorted_files_to_read = sorted(files_to_read, key=lambda f: (f.last_modified, f.uri))
         slices = [
-            {
+            {self.FILES_KEY: list(group[1])}
             for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
         ]
+        if slices and not self.preserve_directory_structure:
+            duplicated_files_names = self._duplicated_files_names(slices)
+            if duplicated_files_names:
+                raise DuplicatedFilesError(
+                    stream=self.name, duplicated_files_names=duplicated_files_names
+                )
         return slices
 
     def transform_record(
```
|