airbyte-cdk 6.13.1.dev4109__py3-none-any.whl → 6.13.1.dev41012__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
- airbyte_cdk/entrypoint.py +1 -13
- airbyte_cdk/sources/declarative/auth/oauth.py +0 -26
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +51 -24
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +20 -128
- airbyte_cdk/sources/declarative/extractors/__init__.py +0 -2
- airbyte_cdk/sources/declarative/extractors/record_selector.py +7 -5
- airbyte_cdk/sources/declarative/interpolation/macros.py +0 -21
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +11 -97
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +14 -71
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +4 -33
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +35 -52
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +7 -10
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +4 -9
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +6 -11
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +11 -13
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +13 -14
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +7 -6
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +7 -10
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +4 -1
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +64 -71
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +4 -4
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +1 -3
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +3 -8
- airbyte_cdk/sources/file_based/exceptions.py +23 -31
- airbyte_cdk/sources/file_based/file_based_source.py +8 -17
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +6 -7
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +2 -25
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +20 -10
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +4 -20
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +4 -34
- airbyte_cdk/sources/types.py +0 -3
- {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.13.1.dev41012.dist-info}/METADATA +2 -2
- {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.13.1.dev41012.dist-info}/RECORD +36 -39
- {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.13.1.dev41012.dist-info}/WHEEL +1 -1
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +0 -55
- airbyte_cdk/sources/declarative/requesters/README.md +0 -57
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +0 -61
- {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.13.1.dev41012.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.13.1.dev41012.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/retrievers/simple_retriever.py

@@ -6,7 +6,18 @@ import json
 from dataclasses import InitVar, dataclass, field
 from functools import partial
 from itertools import islice
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)

 import requests

@@ -79,6 +90,9 @@ class SimpleRetriever(Retriever):

     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
         self._paginator = self.paginator or NoPagination(parameters=parameters)
+        self._last_response: Optional[requests.Response] = None
+        self._last_page_size: int = 0
+        self._last_record: Optional[Record] = None
         self._parameters = parameters
         self._name = (
             InterpolatedString(self._name, parameters=parameters)
@@ -86,6 +100,10 @@ class SimpleRetriever(Retriever):
             else self._name
         )

+        # This mapping is used during a resumable full refresh syncs to indicate whether a partition has started syncing
+        # records. Partitions serve as the key and map to True if they already began processing records
+        self._partition_started: MutableMapping[Any, bool] = dict()
+
     @property # type: ignore
     def name(self) -> str:
         """
@@ -233,13 +251,17 @@ class SimpleRetriever(Retriever):
            raise ValueError("Request body json cannot be a string")
         return body_json

-    def _paginator_path(
+    def _paginator_path(
+        self,
+    ) -> Optional[str]:
         """
         If the paginator points to a path, follow it, else return nothing so the requester is used.
+        :param stream_state:
+        :param stream_slice:
         :param next_page_token:
         :return:
         """
-        return self._paginator.path(
+        return self._paginator.path()

     def _parse_response(
         self,
@@ -250,15 +272,22 @@ class SimpleRetriever(Retriever):
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Iterable[Record]:
         if not response:
+            self._last_response = None
             yield from []
         else:
-
+            self._last_response = response
+            record_generator = self.record_selector.select_records(
                 response=response,
                 stream_state=stream_state,
                 records_schema=records_schema,
                 stream_slice=stream_slice,
                 next_page_token=next_page_token,
             )
+            self._last_page_size = 0
+            for record in record_generator:
+                self._last_page_size += 1
+                self._last_record = record
+                yield record

     @property # type: ignore
     def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
@@ -270,13 +299,7 @@ class SimpleRetriever(Retriever):
         if not isinstance(value, property):
             self._primary_key = value

-    def _next_page_token(
-        self,
-        response: requests.Response,
-        last_page_size: int,
-        last_record: Optional[Record],
-        last_page_token_value: Optional[Any],
-    ) -> Optional[Mapping[str, Any]]:
+    def _next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
         """
         Specifies a pagination strategy.

@@ -284,12 +307,7 @@ class SimpleRetriever(Retriever):

         :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response.
         """
-        return self._paginator.next_page_token(
-            response=response,
-            last_page_size=last_page_size,
-            last_record=last_record,
-            last_page_token_value=last_page_token_value,
-        )
+        return self._paginator.next_page_token(response, self._last_page_size, self._last_record)

     def _fetch_next_page(
         self,
@@ -298,7 +316,7 @@ class SimpleRetriever(Retriever):
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Optional[requests.Response]:
         return self.requester.send_request(
-            path=self._paginator_path(
+            path=self._paginator_path(),
             stream_state=stream_state,
             stream_slice=stream_slice,
             next_page_token=next_page_token,
@@ -327,37 +345,20 @@ class SimpleRetriever(Retriever):
     # This logic is similar to _read_pages in the HttpStream class. When making changes here, consider making changes there as well.
     def _read_pages(
         self,
-        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[
+        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[StreamData]],
         stream_state: Mapping[str, Any],
         stream_slice: StreamSlice,
-    ) -> Iterable[
+    ) -> Iterable[StreamData]:
         pagination_complete = False
-
-        next_page_token: Optional[Mapping[str, Any]] = (
-            {"next_page_token": initial_token} if initial_token else None
-        )
+        next_page_token = None
         while not pagination_complete:
             response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
-
-            last_page_size = 0
-            last_record: Optional[Record] = None
-            for record in records_generator_fn(response):
-                last_page_size += 1
-                last_record = record
-                yield record
+            yield from records_generator_fn(response)

             if not response:
                 pagination_complete = True
             else:
-
-                    next_page_token.get("next_page_token") if next_page_token else None
-                )
-                next_page_token = self._next_page_token(
-                    response=response,
-                    last_page_size=last_page_size,
-                    last_record=last_record,
-                    last_page_token_value=last_page_token_value,
-                )
+                next_page_token = self._next_page_token(response)
                 if not next_page_token:
                     pagination_complete = True

@@ -366,38 +367,19 @@ class SimpleRetriever(Retriever):

     def _read_single_page(
         self,
-        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[
+        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[StreamData]],
         stream_state: Mapping[str, Any],
         stream_slice: StreamSlice,
     ) -> Iterable[StreamData]:
-
-
-        initial_token = self._paginator.get_initial_token()
-        next_page_token: Optional[Mapping[str, Any]] = (
-            {"next_page_token": initial_token} if initial_token else None
-        )
-
-        response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
-
-        last_page_size = 0
-        last_record: Optional[Record] = None
-        for record in records_generator_fn(response):
-            last_page_size += 1
-            last_record = record
-            yield record
+        response = self._fetch_next_page(stream_state, stream_slice)
+        yield from records_generator_fn(response)

         if not response:
-            next_page_token = {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
+            next_page_token: Mapping[str, Any] = {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
         else:
-
-
-
-            next_page_token = self._next_page_token(
-                response=response,
-                last_page_size=last_page_size,
-                last_record=last_record,
-                last_page_token_value=last_page_token_value,
-            ) or {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
+            next_page_token = self._next_page_token(response) or {
+                FULL_REFRESH_SYNC_COMPLETE_KEY: True
+            }

         if self.cursor:
             self.cursor.close_slice(
@@ -432,14 +414,25 @@ class SimpleRetriever(Retriever):
         if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
             stream_state = self.state

-            # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
-            #
-            #
+            # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to fetch more records
+            # The platform deletes stream state for full refresh streams before starting a new job, so we don't need to worry about
+            # this value existing for the initial attempt
            if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
                 return
+            cursor_value = stream_state.get("next_page_token")
+
+            # The first attempt to read a page for the current partition should reset the paginator to the current
+            # cursor state which is initially assigned to the incoming state from the platform
+            partition_key = self._to_partition_key(_slice.partition)
+            if partition_key not in self._partition_started:
+                self._partition_started[partition_key] = True
+                self._paginator.reset(reset_value=cursor_value)

             yield from self._read_single_page(record_generator, stream_state, _slice)
         else:
+            # Fixing paginator types has a long tail of dependencies
+            self._paginator.reset()
+
             for stream_data in self._read_pages(record_generator, self.state, _slice):
                 current_record = self._extract_record(stream_data, _slice)
                 if self.cursor and current_record:
@@ -525,7 +518,7 @@ class SimpleRetriever(Retriever):
         stream_state: Mapping[str, Any],
         records_schema: Mapping[str, Any],
         stream_slice: Optional[StreamSlice],
-    ) -> Iterable[
+    ) -> Iterable[StreamData]:
         yield from self._parse_response(
             response,
             stream_slice=stream_slice,
@@ -569,7 +562,7 @@ class SimpleRetrieverTestReadDecorator(SimpleRetriever):
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Optional[requests.Response]:
         return self.requester.send_request(
-            path=self._paginator_path(
+            path=self._paginator_path(),
             stream_state=stream_state,
             stream_slice=stream_slice,
             next_page_token=next_page_token,
airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py

@@ -1,6 +1,6 @@
 # Copyright (c) 2024 Airbyte, Inc., all rights reserved.

-from typing import Any, Iterable, Mapping, Optional
+from typing import Any, Callable, Iterable, Mapping, Optional

 from airbyte_cdk.sources.declarative.retrievers import Retriever
 from airbyte_cdk.sources.message import MessageRepository
@@ -16,7 +16,7 @@ class DeclarativePartitionFactory:
         self,
         stream_name: str,
         json_schema: Mapping[str, Any],
-
+        retriever_factory: Callable[[], Retriever],
         message_repository: MessageRepository,
     ) -> None:
         """
@@ -26,14 +26,14 @@ class DeclarativePartitionFactory:
         """
         self._stream_name = stream_name
         self._json_schema = json_schema
-        self.
+        self._retriever_factory = retriever_factory
         self._message_repository = message_repository

     def create(self, stream_slice: StreamSlice) -> Partition:
         return DeclarativePartition(
             self._stream_name,
             self._json_schema,
-            self.
+            self._retriever_factory(),
             self._message_repository,
             stream_slice,
         )
airbyte_cdk/sources/declarative/transformations/flatten_fields.py

@@ -11,8 +11,6 @@ from airbyte_cdk.sources.types import Config, StreamSlice, StreamState

 @dataclass
 class FlattenFields(RecordTransformation):
-    flatten_lists: bool = True
-
     def transform(
         self,
         record: Dict[str, Any],
@@ -41,7 +39,7 @@ class FlattenFields(RecordTransformation):
                    )
                    stack.append((value, new_key))

-            elif isinstance(current_record, list)
+            elif isinstance(current_record, list):
                for i, item in enumerate(current_record):
                    force_with_parent_name = True
                    stack.append((item, f"{parent_key}.{i}"))
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py

@@ -31,14 +31,9 @@ class DeliverRawFiles(BaseModel):

     delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)

-
-        title="Preserve
-        description=
-            "If enabled, sends subdirectory folder structure "
-            "along with source file names to the destination. "
-            "Otherwise, files will be synced by their names only. "
-            "This option is ignored when file-based replication is not enabled."
-        ),
+    preserve_subdirectories_directories: bool = Field(
+        title="Preserve Subdirectories in File Paths",
+        description="If enabled replicate source folder structure",
         default=True,
     )

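The renamed `preserve_subdirectories_directories` option is an ordinary pydantic field, so its `title` and `description` flow straight into the connector spec shown to users. A simplified stand-in model (pydantic v1 style `Field`/`const`, matching the usage above) and the schema it produces:

```python
from typing import Literal

from pydantic import BaseModel, Field  # pydantic v1; on pydantic 2.x use `from pydantic.v1 import ...`


class DeliverRawFilesSketch(BaseModel):
    """Simplified stand-in for the DeliverRawFiles delivery-method model above."""

    delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)
    preserve_subdirectories_directories: bool = Field(
        title="Preserve Subdirectories in File Paths",
        description="If enabled replicate source folder structure",
        default=True,
    )


print(DeliverRawFilesSketch.schema()["properties"]["preserve_subdirectories_directories"])
# {'title': 'Preserve Subdirectories in File Paths',
#  'description': 'If enabled replicate source folder structure',
#  'default': True, 'type': 'boolean'}
```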
airbyte_cdk/sources/file_based/exceptions.py

@@ -112,37 +112,7 @@ class ErrorListingFiles(BaseFileBasedSourceError):


 class DuplicatedFilesError(BaseFileBasedSourceError):
-
-        self._duplicated_files_names = duplicated_files_names
-        self._stream_name: str = kwargs["stream"]
-        super().__init__(self._format_duplicate_files_error_message(), **kwargs)
-
-    def _format_duplicate_files_error_message(self) -> str:
-        duplicated_files_messages = []
-        for duplicated_file in self._duplicated_files_names:
-            for duplicated_file_name, file_paths in duplicated_file.items():
-                file_duplicated_message = (
-                    f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
-                    + "".join(f"\n - {file_paths}")
-                )
-                duplicated_files_messages.append(file_duplicated_message)
-
-        error_message = (
-            f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
-            "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
-            "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
-            + "\n".join(duplicated_files_messages)
-        )
-
-        return error_message
-
-    def __repr__(self) -> str:
-        """Return a string representation of the exception."""
-        class_name = self.__class__.__name__
-        properties_str = ", ".join(
-            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
-        )
-        return f"{class_name}({properties_str})"
+    pass


 class CustomFileBasedException(AirbyteTracedException):
@@ -157,3 +127,25 @@ class CustomFileBasedException(AirbyteTracedException):

 class FileSizeLimitError(CustomFileBasedException):
     pass
+
+
+def format_duplicate_files_error_message(
+    stream_name: str, duplicated_files_names: List[dict[str, List[str]]]
+) -> str:
+    duplicated_files_messages = []
+    for duplicated_file in duplicated_files_names:
+        for duplicated_file_name, file_paths in duplicated_file.items():
+            file_duplicated_message = (
+                f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
+                + "".join(f"\n - {file_paths}")
+            )
+            duplicated_files_messages.append(file_duplicated_message)
+
+    error_message = (
+        f"ERROR: Duplicate filenames found for stream {stream_name}. "
+        "Duplicate file names are not allowed if the Preserve Subdirectories in File Paths option is disabled. "
+        "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
+        + "\n".join(duplicated_files_messages)
+    )
+
+    return error_message
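Because the duplicate-file message is now built by the module-level `format_duplicate_files_error_message` helper rather than inside the exception class, it can be exercised directly. Assuming this CDK version is installed, and using the `{file_name: [paths]}` structure that `DefaultFileBasedStream._duplicated_files_names` produces:

```python
from airbyte_cdk.sources.file_based.exceptions import format_duplicate_files_error_message

duplicates = [{"report.csv": ["2024/01/report.csv", "2024/02/report.csv"]}]

print(
    format_duplicate_files_error_message(
        stream_name="my_stream", duplicated_files_names=duplicates
    )
)
# ERROR: Duplicate filenames found for stream my_stream. Duplicate file names are not allowed if the
# Preserve Subdirectories in File Paths option is disabled. Please remove or rename the duplicate files
# before attempting to re-run the sync.
#
# 2 duplicates found for file name report.csv:
#
#  - ['2024/01/report.csv', '2024/02/report.csv']
```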
airbyte_cdk/sources/file_based/file_based_source.py

@@ -311,7 +311,9 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
            errors_collector=self.errors_collector,
            cursor=cursor,
            use_file_transfer=self._use_file_transfer(parsed_config),
-
+            preserve_subdirectories_directories=self._preserve_subdirectories_directories(
+                parsed_config
+            ),
        )

     def _get_stream_from_catalog(
@@ -388,23 +390,12 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
         return use_file_transfer

     @staticmethod
-    def
-
-        Determines whether to preserve directory structure during file transfer.
-
-        When enabled, files maintain their subdirectory paths in the destination.
-        When disabled, files are flattened to the root of the destination.
-
-        Args:
-            parsed_config: The parsed configuration containing delivery method settings
-
-        Returns:
-            True if directory structure should be preserved (default), False otherwise
-        """
+    def _preserve_subdirectories_directories(parsed_config: AbstractFileBasedSpec) -> bool:
+        # fall back to preserve subdirectories if config is not present or incomplete
         if (
             FileBasedSource._use_file_transfer(parsed_config)
-            and hasattr(parsed_config.delivery_method, "
-            and parsed_config.delivery_method.
+            and hasattr(parsed_config.delivery_method, "preserve_subdirectories_directories")
+            and parsed_config.delivery_method.preserve_subdirectories_directories is not None
         ):
-            return parsed_config.delivery_method.
+            return parsed_config.delivery_method.preserve_subdirectories_directories
         return True
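`_preserve_subdirectories_directories` only honours an explicit value: when file transfer is off, when the delivery method lacks the field, or when the field is `None`, it falls back to `True`. The same fallback logic as a quick standalone sketch, with a throwaway namespace standing in for the real `AbstractFileBasedSpec`:

```python
from types import SimpleNamespace


def preserve_subdirectories(parsed_config: SimpleNamespace, use_file_transfer: bool) -> bool:
    """Same fallback as above: only an explicit value on the delivery method is honoured."""
    if (
        use_file_transfer
        and hasattr(parsed_config.delivery_method, "preserve_subdirectories_directories")
        and parsed_config.delivery_method.preserve_subdirectories_directories is not None
    ):
        return parsed_config.delivery_method.preserve_subdirectories_directories
    return True


opted_out = SimpleNamespace(delivery_method=SimpleNamespace(preserve_subdirectories_directories=False))
missing = SimpleNamespace(delivery_method=SimpleNamespace())

print(preserve_subdirectories(opted_out, use_file_transfer=True))   # False – user opted out
print(preserve_subdirectories(opted_out, use_file_transfer=False))  # True – ignored without file transfer
print(preserve_subdirectories(missing, use_file_transfer=True))     # True – field absent, fall back
```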
airbyte_cdk/sources/file_based/file_based_stream_reader.py

@@ -135,15 +135,14 @@ class AbstractFileBasedStreamReader(ABC):
            return use_file_transfer
        return False

-    def
+    def preserve_subdirectories_directories(self) -> bool:
        # fall back to preserve subdirectories if config is not present or incomplete
        if (
            self.use_file_transfer()
-            and self.config
-            and
-            and self.config.delivery_method.preserve_directory_structure is not None
+            and hasattr(self.config.delivery_method, "preserve_subdirectories_directories")
+            and self.config.delivery_method.preserve_subdirectories_directories is not None
        ):
-            return self.config.delivery_method.
+            return self.config.delivery_method.preserve_subdirectories_directories
        return True

    @abstractmethod
@@ -171,8 +170,8 @@ class AbstractFileBasedStreamReader(ABC):
        ...

    def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
-
-        if
+        preserve_subdirectories_directories = self.preserve_subdirectories_directories()
+        if preserve_subdirectories_directories:
            # Remove left slashes from source path format to make relative path for writing locally
            file_relative_path = file.uri.lstrip("/")
        else:
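`_get_file_transfer_paths` keeps the source directory layout by turning the file URI into a relative path (`lstrip("/")`) when subdirectories are preserved. The flattening branch is not visible in this hunk, so the sketch below assumes it falls back to the bare file name:

```python
from os import path


def file_write_path(uri: str, local_directory: str, preserve_subdirectories: bool) -> str:
    """Sketch of the relative-path choice in _get_file_transfer_paths."""
    if preserve_subdirectories:
        # Strip leading slashes so the source path becomes a relative path locally.
        file_relative_path = uri.lstrip("/")
    else:
        # Assumed flattening behaviour; the else branch is not shown in the hunk above.
        file_relative_path = path.basename(uri)
    return path.join(local_directory, file_relative_path)


print(file_write_path("/bucket/2024/01/report.csv", "/tmp/files", preserve_subdirectories=True))
# /tmp/files/bucket/2024/01/report.csv
print(file_write_path("/bucket/2024/01/report.csv", "/tmp/files", preserve_subdirectories=False))
# /tmp/files/report.csv
```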
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

@@ -2,7 +2,6 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 import logging
-import os
 import traceback
 from datetime import datetime
 from io import BytesIO, IOBase
@@ -43,34 +42,12 @@ unstructured_partition_pdf = None
 unstructured_partition_docx = None
 unstructured_partition_pptx = None

-AIRBYTE_NLTK_DATA_DIR = "/airbyte/nltk_data"
-TMP_NLTK_DATA_DIR = "/tmp/nltk_data"
-
-
-def get_nltk_temp_folder() -> str:
-    """
-    For non-root connectors /tmp is not currently writable, but we should allow it in the future.
-    It's safe to use /airbyte for now. Fallback to /tmp for local development.
-    """
-    try:
-        nltk_data_dir = AIRBYTE_NLTK_DATA_DIR
-        os.makedirs(nltk_data_dir, exist_ok=True)
-    except OSError:
-        nltk_data_dir = TMP_NLTK_DATA_DIR
-        os.makedirs(nltk_data_dir, exist_ok=True)
-    return nltk_data_dir
-
-
 try:
-    nltk_data_dir = get_nltk_temp_folder()
-    nltk.data.path.append(nltk_data_dir)
     nltk.data.find("tokenizers/punkt.zip")
     nltk.data.find("tokenizers/punkt_tab.zip")
-    nltk.data.find("tokenizers/averaged_perceptron_tagger_eng.zip")
 except LookupError:
-    nltk.download("punkt"
-    nltk.download("punkt_tab"
-    nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir, quiet=True)
+    nltk.download("punkt")
+    nltk.download("punkt_tab")


 def optional_decode(contents: Union[str, bytes]) -> str:
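With the `/airbyte/nltk_data` bootstrap removed, the parser relies on NLTK's default data paths: probe with `nltk.data.find` and download on `LookupError`. The resulting pattern (it requires `nltk` to be installed) is simply:

```python
import nltk

try:
    # Probe NLTK's default search paths for the tokenizer data.
    nltk.data.find("tokenizers/punkt.zip")
    nltk.data.find("tokenizers/punkt_tab.zip")
except LookupError:
    # Download into NLTK's default data directory (typically ~/nltk_data).
    nltk.download("punkt")
    nltk.download("punkt_tab")
```

Note that `download_dir` is no longer pinned, so the data lands wherever NLTK resolves its default location.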
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py

@@ -22,6 +22,7 @@ from airbyte_cdk.sources.file_based.exceptions import (
     RecordParseError,
     SchemaInferenceError,
     StopSyncPerValidationPolicy,
+    format_duplicate_files_error_message,
 )
 from airbyte_cdk.sources.file_based.file_types import FileTransfer
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -46,7 +47,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     """

     FILE_TRANSFER_KW = "use_file_transfer"
-
+    PRESERVE_SUBDIRECTORIES_KW = "preserve_subdirectories_directories"
     FILES_KEY = "files"
     DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
     ab_last_mod_col = "_ab_source_file_last_modified"
@@ -55,14 +56,13 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     source_file_url = "source_file_url"
     airbyte_columns = [ab_last_mod_col, ab_file_name_col]
     use_file_transfer = False
-
+    preserve_subdirectories_directories = True

     def __init__(self, **kwargs: Any):
         if self.FILE_TRANSFER_KW in kwargs:
             self.use_file_transfer = kwargs.pop(self.FILE_TRANSFER_KW, False)
-
-
-                self.PRESERVE_DIRECTORY_STRUCTURE_KW, True
+            self.preserve_subdirectories_directories = kwargs.pop(
+                self.PRESERVE_SUBDIRECTORIES_KW, True
             )
         super().__init__(**kwargs)

@@ -111,13 +111,20 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     def _duplicated_files_names(
         self, slices: List[dict[str, List[RemoteFile]]]
     ) -> List[dict[str, List[str]]]:
-        seen_file_names
+        seen_file_names = set()
+        duplicates_file_names = set()
+        file_paths = defaultdict(list)
         for file_slice in slices:
             for file_found in file_slice[self.FILES_KEY]:
                 file_name = path.basename(file_found.uri)
-                seen_file_names
+                if file_name not in seen_file_names:
+                    seen_file_names.add(file_name)
+                else:
+                    duplicates_file_names.add(file_name)
+                file_paths[file_name].append(file_found.uri)
         return [
-            {
+            {duplicated_file: file_paths[duplicated_file]}
+            for duplicated_file in duplicates_file_names
         ]

     def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
@@ -129,11 +136,14 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
            {self.FILES_KEY: list(group[1])}
            for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
        ]
-        if slices and not self.
+        if slices and not self.preserve_subdirectories_directories:
            duplicated_files_names = self._duplicated_files_names(slices)
            if duplicated_files_names:
                raise DuplicatedFilesError(
-
+                    format_duplicate_files_error_message(
+                        stream_name=self.name, duplicated_files_names=duplicated_files_names
+                    ),
+                    stream=self.name,
                )
        return slices

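The rewritten `_duplicated_files_names` records every URI per base name and reports only the names seen more than once. Here is the same logic lifted into a standalone function over a flat list of URIs, to show the shape of its output:

```python
from collections import defaultdict
from os import path
from typing import Dict, List


def duplicated_files_names(uris: List[str]) -> List[Dict[str, List[str]]]:
    """Return [{file_name: [uris]}] for every base name that appears more than once."""
    seen_file_names = set()
    duplicates_file_names = set()
    file_paths = defaultdict(list)
    for uri in uris:
        file_name = path.basename(uri)
        if file_name not in seen_file_names:
            seen_file_names.add(file_name)
        else:
            duplicates_file_names.add(file_name)
        file_paths[file_name].append(uri)
    return [{name: file_paths[name]} for name in duplicates_file_names]


print(duplicated_files_names(["a/data.csv", "b/data.csv", "c/other.csv"]))
# [{'data.csv': ['a/data.csv', 'b/data.csv']}]
```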
airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py

@@ -81,10 +81,10 @@ class AbstractOauth2Authenticator(AuthBase):
         Override to define additional parameters
         """
         payload: MutableMapping[str, Any] = {
-
-
-
-
+            "grant_type": self.get_grant_type(),
+            "client_id": self.get_client_id(),
+            "client_secret": self.get_client_secret(),
+            "refresh_token": self.get_refresh_token(),
         }

         if self.get_scopes():
@@ -206,26 +206,14 @@ class AbstractOauth2Authenticator(AuthBase):
     def get_token_refresh_endpoint(self) -> Optional[str]:
         """Returns the endpoint to refresh the access token"""

-    @abstractmethod
-    def get_client_id_name(self) -> str:
-        """The client id name to authenticate"""
-
     @abstractmethod
     def get_client_id(self) -> str:
         """The client id to authenticate"""

-    @abstractmethod
-    def get_client_secret_name(self) -> str:
-        """The client secret name to authenticate"""
-
     @abstractmethod
     def get_client_secret(self) -> str:
         """The client secret to authenticate"""

-    @abstractmethod
-    def get_refresh_token_name(self) -> str:
-        """The refresh token name to authenticate"""
-
     @abstractmethod
     def get_refresh_token(self) -> Optional[str]:
         """The token used to refresh the access token when it expires"""
@@ -258,10 +246,6 @@ class AbstractOauth2Authenticator(AuthBase):
     def get_grant_type(self) -> str:
         """Returns grant_type specified for requesting access_token"""

-    @abstractmethod
-    def get_grant_type_name(self) -> str:
-        """Returns grant_type specified name for requesting access_token"""
-
     @property
     @abstractmethod
     def access_token(self) -> str:
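Dropping the `get_*_name` hooks means the refresh payload always uses the literal keys `grant_type`, `client_id`, `client_secret`, and `refresh_token`. A minimal sketch of an authenticator that implements only the getters that remain abstract and builds that payload (a toy class, not the CDK's):

```python
from typing import Any, Dict, Optional


class ToyOauthAuthenticator:
    """Stand-in exposing only the getters that remain abstract after this change."""

    def __init__(self, client_id: str, client_secret: str, refresh_token: str) -> None:
        self._client_id = client_id
        self._client_secret = client_secret
        self._refresh_token = refresh_token

    def get_grant_type(self) -> str:
        return "refresh_token"

    def get_client_id(self) -> str:
        return self._client_id

    def get_client_secret(self) -> str:
        return self._client_secret

    def get_refresh_token(self) -> Optional[str]:
        return self._refresh_token

    def build_refresh_request_body(self) -> Dict[str, Any]:
        # Fixed keys, matching the payload dictionary in the hunk above.
        return {
            "grant_type": self.get_grant_type(),
            "client_id": self.get_client_id(),
            "client_secret": self.get_client_secret(),
            "refresh_token": self.get_refresh_token(),
        }


auth = ToyOauthAuthenticator("my-id", "my-secret", "my-refresh-token")
print(auth.build_refresh_request_body())
# {'grant_type': 'refresh_token', 'client_id': 'my-id', 'client_secret': 'my-secret', 'refresh_token': 'my-refresh-token'}
```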
|