airbyte-cdk 6.13.1.dev4109__py3-none-any.whl → 6.14.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- airbyte_cdk/entrypoint.py +1 -13
- airbyte_cdk/sources/declarative/auth/oauth.py +0 -26
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +51 -24
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +20 -128
- airbyte_cdk/sources/declarative/extractors/__init__.py +0 -2
- airbyte_cdk/sources/declarative/extractors/record_selector.py +7 -5
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +11 -97
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +14 -71
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +4 -33
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +35 -52
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +7 -10
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +4 -9
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +6 -11
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +13 -13
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +13 -14
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +7 -6
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +10 -10
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +4 -1
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +64 -71
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +4 -4
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +1 -3
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +0 -11
- airbyte_cdk/sources/file_based/exceptions.py +0 -34
- airbyte_cdk/sources/file_based/file_based_source.py +5 -28
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +4 -18
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +2 -25
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +2 -30
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +4 -20
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +4 -34
- airbyte_cdk/sources/types.py +0 -3
- {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.14.0.dist-info}/METADATA +2 -2
- {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.14.0.dist-info}/RECORD +35 -38
- {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.14.0.dist-info}/WHEEL +1 -1
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +0 -55
- airbyte_cdk/sources/declarative/requesters/README.md +0 -57
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +0 -61
- {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.14.0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.14.0.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/retrievers/simple_retriever.py

```diff
@@ -6,7 +6,18 @@ import json
 from dataclasses import InitVar, dataclass, field
 from functools import partial
 from itertools import islice
-from typing import Any, Callable, Iterable, List, Mapping, Optional, Set, Tuple, Union
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)
 
 import requests
 
@@ -79,6 +90,9 @@ class SimpleRetriever(Retriever):
 
     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
         self._paginator = self.paginator or NoPagination(parameters=parameters)
+        self._last_response: Optional[requests.Response] = None
+        self._last_page_size: int = 0
+        self._last_record: Optional[Record] = None
         self._parameters = parameters
         self._name = (
             InterpolatedString(self._name, parameters=parameters)
@@ -86,6 +100,10 @@ class SimpleRetriever(Retriever):
             else self._name
         )
 
+        # This mapping is used during a resumable full refresh syncs to indicate whether a partition has started syncing
+        # records. Partitions serve as the key and map to True if they already began processing records
+        self._partition_started: MutableMapping[Any, bool] = dict()
+
     @property  # type: ignore
     def name(self) -> str:
         """
@@ -233,13 +251,17 @@ class SimpleRetriever(Retriever):
             raise ValueError("Request body json cannot be a string")
         return body_json
 
-    def _paginator_path(self, next_page_token: Optional[Mapping[str, Any]] = None) -> Optional[str]:
+    def _paginator_path(
+        self,
+    ) -> Optional[str]:
         """
         If the paginator points to a path, follow it, else return nothing so the requester is used.
+        :param stream_state:
+        :param stream_slice:
         :param next_page_token:
         :return:
         """
-        return self._paginator.path(next_page_token=next_page_token)
+        return self._paginator.path()
 
     def _parse_response(
         self,
@@ -250,15 +272,22 @@ class SimpleRetriever(Retriever):
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Iterable[Record]:
         if not response:
+            self._last_response = None
             yield from []
         else:
-            yield from self.record_selector.select_records(
+            self._last_response = response
+            record_generator = self.record_selector.select_records(
                 response=response,
                 stream_state=stream_state,
                 records_schema=records_schema,
                 stream_slice=stream_slice,
                 next_page_token=next_page_token,
            )
+            self._last_page_size = 0
+            for record in record_generator:
+                self._last_page_size += 1
+                self._last_record = record
+                yield record
 
     @property  # type: ignore
     def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
```
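Taken together, the hunks above show 6.14.0 moving page bookkeeping back inside the retriever: `_parse_response` counts records and remembers the last one as they stream out, so the paginator can be consulted after the page has been consumed. A minimal sketch of that tracking pattern (a stand-in class, not the actual CDK code):

```python
from typing import Any, Iterable, Iterator, Optional


class PageTracker:
    """Counts records and remembers the last one while a page streams out."""

    def __init__(self) -> None:
        self.last_page_size = 0
        self.last_record: Optional[Any] = None

    def track(self, records: Iterable[Any]) -> Iterator[Any]:
        # Reset the count per page, then update the bookkeeping as each record
        # is yielded, mirroring what _parse_response now does inline.
        self.last_page_size = 0
        for record in records:
            self.last_page_size += 1
            self.last_record = record
            yield record


tracker = PageTracker()
assert list(tracker.track([{"id": 1}, {"id": 2}])) == [{"id": 1}, {"id": 2}]
assert tracker.last_page_size == 2 and tracker.last_record == {"id": 2}
```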
```diff
@@ -270,13 +299,7 @@ class SimpleRetriever(Retriever):
         if not isinstance(value, property):
             self._primary_key = value
 
-    def _next_page_token(
-        self,
-        response: requests.Response,
-        last_page_size: int,
-        last_record: Optional[Record],
-        last_page_token_value: Optional[Any],
-    ) -> Optional[Mapping[str, Any]]:
+    def _next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
         """
         Specifies a pagination strategy.
 
@@ -284,12 +307,7 @@ class SimpleRetriever(Retriever):
 
         :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response.
         """
-        return self._paginator.next_page_token(
-            response=response,
-            last_page_size=last_page_size,
-            last_record=last_record,
-            last_page_token_value=last_page_token_value,
-        )
+        return self._paginator.next_page_token(response, self._last_page_size, self._last_record)
 
     def _fetch_next_page(
         self,
```
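The two signatures in this hunk capture the whole revert: the dev build used a stateless paginator that received all pagination inputs on every call, while 6.14.0 returns to a stateful paginator that holds its own position and exposes `reset()`. Sketched as typing Protocols, as inferred from these hunks rather than from the CDK's own interface definitions:

```python
from typing import Any, Mapping, Optional, Protocol

import requests


class StatelessPaginator(Protocol):
    # dev4109 shape: the caller threads all pagination inputs through each call.
    def next_page_token(
        self,
        response: requests.Response,
        last_page_size: int,
        last_record: Optional[Any],
        last_page_token_value: Optional[Any],
    ) -> Optional[Mapping[str, Any]]: ...


class StatefulPaginator(Protocol):
    # 6.14.0 shape: the paginator keeps its own position and is reset per slice.
    def next_page_token(
        self,
        response: requests.Response,
        last_page_size: int,
        last_record: Optional[Any],
    ) -> Optional[Mapping[str, Any]]: ...

    def reset(self, reset_value: Optional[Any] = None) -> None: ...
```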
```diff
@@ -298,7 +316,7 @@ class SimpleRetriever(Retriever):
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Optional[requests.Response]:
         return self.requester.send_request(
-            path=self._paginator_path(next_page_token=next_page_token),
+            path=self._paginator_path(),
             stream_state=stream_state,
             stream_slice=stream_slice,
             next_page_token=next_page_token,
@@ -327,37 +345,20 @@ class SimpleRetriever(Retriever):
     # This logic is similar to _read_pages in the HttpStream class. When making changes here, consider making changes there as well.
     def _read_pages(
         self,
-        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]],
+        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[StreamData]],
         stream_state: Mapping[str, Any],
         stream_slice: StreamSlice,
-    ) -> Iterable[Record]:
+    ) -> Iterable[StreamData]:
         pagination_complete = False
-        initial_token = self._paginator.get_initial_token()
-        next_page_token: Optional[Mapping[str, Any]] = (
-            {"next_page_token": initial_token} if initial_token else None
-        )
+        next_page_token = None
         while not pagination_complete:
             response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
-
-            last_page_size = 0
-            last_record: Optional[Record] = None
-            for record in records_generator_fn(response):
-                last_page_size += 1
-                last_record = record
-                yield record
+            yield from records_generator_fn(response)
 
             if not response:
                 pagination_complete = True
             else:
-                last_page_token_value = (
-                    next_page_token.get("next_page_token") if next_page_token else None
-                )
-                next_page_token = self._next_page_token(
-                    response=response,
-                    last_page_size=last_page_size,
-                    last_record=last_record,
-                    last_page_token_value=last_page_token_value,
-                )
+                next_page_token = self._next_page_token(response)
                 if not next_page_token:
                     pagination_complete = True
 
```
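Stripped of bookkeeping, both versions of `_read_pages` run the same driver loop: fetch a page, stream its records, then either stop or ask for the next token. A toy version of that loop, with hypothetical `fetch_page` and `next_token` callables standing in for the CDK machinery:

```python
from typing import Callable, Iterator, Mapping, Optional


def read_pages(
    fetch_page: Callable[[Optional[Mapping[str, object]]], Optional[list]],
    next_token: Callable[[list], Optional[Mapping[str, object]]],
) -> Iterator[object]:
    # Same shape as the 6.14.0 loop: fetch, stream the page's records out, then
    # ask for the next token; a missing response or missing token ends the read.
    token: Optional[Mapping[str, object]] = None
    while True:
        page = fetch_page(token)
        yield from page or []
        if not page:
            break
        token = next_token(page)
        if not token:
            break


pages = iter([[1, 2], [3]])
records = list(
    read_pages(
        lambda t: next(pages, None),
        lambda p: {"page": "next"} if len(p) > 1 else None,
    )
)
assert records == [1, 2, 3]
```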
```diff
@@ -366,38 +367,19 @@ class SimpleRetriever(Retriever):
 
     def _read_single_page(
         self,
-        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]],
+        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[StreamData]],
         stream_state: Mapping[str, Any],
         stream_slice: StreamSlice,
     ) -> Iterable[StreamData]:
-        initial_token = stream_state.get("next_page_token")
-        if initial_token is None:
-            initial_token = self._paginator.get_initial_token()
-        next_page_token: Optional[Mapping[str, Any]] = (
-            {"next_page_token": initial_token} if initial_token else None
-        )
-
-        response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
-
-        last_page_size = 0
-        last_record: Optional[Record] = None
-        for record in records_generator_fn(response):
-            last_page_size += 1
-            last_record = record
-            yield record
+        response = self._fetch_next_page(stream_state, stream_slice)
+        yield from records_generator_fn(response)
 
         if not response:
-            next_page_token = {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
+            next_page_token: Mapping[str, Any] = {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
         else:
-            last_page_token_value = (
-                next_page_token.get("next_page_token") if next_page_token else None
-            )
-            next_page_token = self._next_page_token(
-                response=response,
-                last_page_size=last_page_size,
-                last_record=last_record,
-                last_page_token_value=last_page_token_value,
-            ) or {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
+            next_page_token = self._next_page_token(response) or {
+                FULL_REFRESH_SYNC_COMPLETE_KEY: True
+            }
 
         if self.cursor:
             self.cursor.close_slice(
```
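`_read_single_page` reads exactly one page per call and then stores either the next token or a completion sentinel in stream state, so a resumed attempt knows whether to continue or to return immediately. A small sketch of the sentinel logic; the key's string value is an assumption here, since the diff only shows the constant's name:

```python
from typing import Any, Mapping, Optional

# Assumed value for illustration; the diff only references the constant's name.
FULL_REFRESH_SYNC_COMPLETE_KEY = "__ab_full_refresh_sync_complete"


def close_page(next_token: Optional[Mapping[str, Any]]) -> Mapping[str, Any]:
    # No next page (or no response at all) means the slice is done: persist a
    # sentinel so a retried attempt can return immediately instead of re-reading.
    return next_token or {FULL_REFRESH_SYNC_COMPLETE_KEY: True}


assert close_page(None) == {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
assert close_page({"next_page_token": 2}) == {"next_page_token": 2}
```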
```diff
@@ -432,14 +414,25 @@ class SimpleRetriever(Retriever):
         if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
             stream_state = self.state
 
-            # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
-            # fetch more records. The platform deletes stream state for full refresh streams before starting a new
-            # job, so we don't need to worry about this value existing for the initial attempt
+            # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to fetch more records
+            # The platform deletes stream state for full refresh streams before starting a new job, so we don't need to worry about
+            # this value existing for the initial attempt
             if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
                 return
+            cursor_value = stream_state.get("next_page_token")
+
+            # The first attempt to read a page for the current partition should reset the paginator to the current
+            # cursor state which is initially assigned to the incoming state from the platform
+            partition_key = self._to_partition_key(_slice.partition)
+            if partition_key not in self._partition_started:
+                self._partition_started[partition_key] = True
+                self._paginator.reset(reset_value=cursor_value)
 
             yield from self._read_single_page(record_generator, stream_state, _slice)
         else:
+            # Fixing paginator types has a long tail of dependencies
+            self._paginator.reset()
+
             for stream_data in self._read_pages(record_generator, self.state, _slice):
                 current_record = self._extract_record(stream_data, _slice)
                 if self.cursor and current_record:
```
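Because the 6.14.0 paginator is stateful, the retriever must seed it from the incoming platform state exactly once per partition; the `_partition_started` map is the guard that makes that reset idempotent. A sketch of the guard with stand-in classes:

```python
from typing import Any, Dict, Optional


class RecordingPaginator:
    """Toy paginator that records every reset it receives."""

    def __init__(self) -> None:
        self.resets: list = []

    def reset(self, reset_value: Optional[Any] = None) -> None:
        self.resets.append(reset_value)


def seed_partition(
    started: Dict[str, bool],
    partition_key: str,
    cursor_value: Optional[Any],
    paginator: RecordingPaginator,
) -> None:
    # Only the first read of a partition seeds the paginator from the incoming
    # platform state; later reads of the same partition keep their position.
    if partition_key not in started:
        started[partition_key] = True
        paginator.reset(reset_value=cursor_value)


started: Dict[str, bool] = {}
paginator = RecordingPaginator()
seed_partition(started, "p1", {"page": 3}, paginator)
seed_partition(started, "p1", {"page": 3}, paginator)
assert paginator.resets == [{"page": 3}]
```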
```diff
@@ -525,7 +518,7 @@ class SimpleRetriever(Retriever):
         stream_state: Mapping[str, Any],
         records_schema: Mapping[str, Any],
         stream_slice: Optional[StreamSlice],
-    ) -> Iterable[Record]:
+    ) -> Iterable[StreamData]:
         yield from self._parse_response(
             response,
             stream_slice=stream_slice,
@@ -569,7 +562,7 @@ class SimpleRetrieverTestReadDecorator(SimpleRetriever):
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Optional[requests.Response]:
         return self.requester.send_request(
-            path=self._paginator_path(next_page_token=next_page_token),
+            path=self._paginator_path(),
             stream_state=stream_state,
             stream_slice=stream_slice,
             next_page_token=next_page_token,
```
airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py

```diff
@@ -1,6 +1,6 @@
 # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
 
-from typing import Any, Iterable, Mapping, Optional
+from typing import Any, Callable, Iterable, Mapping, Optional
 
 from airbyte_cdk.sources.declarative.retrievers import Retriever
 from airbyte_cdk.sources.message import MessageRepository
@@ -16,7 +16,7 @@ class DeclarativePartitionFactory:
         self,
         stream_name: str,
         json_schema: Mapping[str, Any],
-        retriever: Retriever,
+        retriever_factory: Callable[[], Retriever],
         message_repository: MessageRepository,
     ) -> None:
         """
@@ -26,14 +26,14 @@ class DeclarativePartitionFactory:
         """
         self._stream_name = stream_name
         self._json_schema = json_schema
-        self._retriever = retriever
+        self._retriever_factory = retriever_factory
         self._message_repository = message_repository
 
     def create(self, stream_slice: StreamSlice) -> Partition:
         return DeclarativePartition(
             self._stream_name,
             self._json_schema,
-            self._retriever,
+            self._retriever_factory(),
             self._message_repository,
             stream_slice,
         )
```
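With the retriever stateful again (it tracks `_last_response` and the paginator's position), sharing one instance across partitions would leak state between them, so the factory builds a fresh retriever for every partition it creates. A minimal illustration of the pattern with toy classes:

```python
from dataclasses import dataclass
from typing import Callable


@dataclass
class StatefulRetriever:
    # Stand-in for a retriever that mutates internal state while reading pages.
    pages_read: int = 0


@dataclass
class PartitionFactory:
    retriever_factory: Callable[[], StatefulRetriever]

    def create(self) -> StatefulRetriever:
        # Each partition gets its own retriever, so concurrent partitions never
        # share paginator position or last-response bookkeeping.
        return self.retriever_factory()


factory = PartitionFactory(retriever_factory=StatefulRetriever)
assert factory.create() is not factory.create()
```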
airbyte_cdk/sources/declarative/transformations/flatten_fields.py

```diff
@@ -11,8 +11,6 @@ from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
 
 @dataclass
 class FlattenFields(RecordTransformation):
-    flatten_lists: bool = True
-
     def transform(
         self,
         record: Dict[str, Any],
@@ -41,7 +39,7 @@ class FlattenFields(RecordTransformation):
                    )
                    stack.append((value, new_key))
 
-            elif isinstance(current_record, list) and self.flatten_lists:
+            elif isinstance(current_record, list):
                 for i, item in enumerate(current_record):
                     force_with_parent_name = True
                     stack.append((item, f"{parent_key}.{i}"))
```
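With the `flatten_lists` toggle removed, list values are now always flattened using a `.<index>` suffix on the parent key. A simplified, self-contained version of the traversal these hunks describe (the real transform works on the record in place and also handles key collisions):

```python
from typing import Any, Dict


def flatten(record: Dict[str, Any]) -> Dict[str, Any]:
    # Iterative traversal in the spirit of FlattenFields: dict keys merge upward,
    # and list items always get a ".<index>" suffix on the parent key.
    result: Dict[str, Any] = {}
    stack = [(record, "")]
    while stack:
        current, parent_key = stack.pop()
        if isinstance(current, dict):
            for key, value in current.items():
                new_key = f"{parent_key}.{key}" if parent_key else key
                stack.append((value, new_key))
        elif isinstance(current, list):
            for i, item in enumerate(current):
                stack.append((item, f"{parent_key}.{i}"))
        else:
            result[parent_key] = current
    return result


assert flatten({"a": {"b": 1}, "c": [10, 20]}) == {"a.b": 1, "c.0": 10, "c.1": 20}
```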
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py

```diff
@@ -31,17 +31,6 @@ class DeliverRawFiles(BaseModel):
 
     delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)
 
-    preserve_directory_structure: bool = Field(
-        title="Preserve Sub-Directories in File Paths",
-        description=(
-            "If enabled, sends subdirectory folder structure "
-            "along with source file names to the destination. "
-            "Otherwise, files will be synced by their names only. "
-            "This option is ignored when file-based replication is not enabled."
-        ),
-        default=True,
-    )
-
 
 class AbstractFileBasedSpec(BaseModel):
     """
```
airbyte_cdk/sources/file_based/exceptions.py

```diff
@@ -111,40 +111,6 @@ class ErrorListingFiles(BaseFileBasedSourceError):
     pass
 
 
-class DuplicatedFilesError(BaseFileBasedSourceError):
-    def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
-        self._duplicated_files_names = duplicated_files_names
-        self._stream_name: str = kwargs["stream"]
-        super().__init__(self._format_duplicate_files_error_message(), **kwargs)
-
-    def _format_duplicate_files_error_message(self) -> str:
-        duplicated_files_messages = []
-        for duplicated_file in self._duplicated_files_names:
-            for duplicated_file_name, file_paths in duplicated_file.items():
-                file_duplicated_message = (
-                    f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
-                    + "".join(f"\n - {file_paths}")
-                )
-                duplicated_files_messages.append(file_duplicated_message)
-
-        error_message = (
-            f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
-            "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
-            "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
-            + "\n".join(duplicated_files_messages)
-        )
-
-        return error_message
-
-    def __repr__(self) -> str:
-        """Return a string representation of the exception."""
-        class_name = self.__class__.__name__
-        properties_str = ", ".join(
-            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
-        )
-        return f"{class_name}({properties_str})"
-
-
 class CustomFileBasedException(AirbyteTracedException):
     """
     A specialized exception for file-based connectors.
```
airbyte_cdk/sources/file_based/file_based_source.py

```diff
@@ -242,7 +242,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     stream=self._make_default_stream(
                         stream_config=stream_config,
                         cursor=cursor,
-                        parsed_config=parsed_config,
+                        use_file_transfer=self._use_file_transfer(parsed_config),
                     ),
                     source=self,
                     logger=self.logger,
@@ -273,7 +273,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     stream=self._make_default_stream(
                         stream_config=stream_config,
                         cursor=cursor,
-                        parsed_config=parsed_config,
+                        use_file_transfer=self._use_file_transfer(parsed_config),
                     ),
                     source=self,
                     logger=self.logger,
@@ -285,7 +285,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                 stream = self._make_default_stream(
                     stream_config=stream_config,
                     cursor=cursor,
-                    parsed_config=parsed_config,
+                    use_file_transfer=self._use_file_transfer(parsed_config),
                 )
 
                 streams.append(stream)
@@ -298,7 +298,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
         self,
         stream_config: FileBasedStreamConfig,
         cursor: Optional[AbstractFileBasedCursor],
-        parsed_config: AbstractFileBasedSpec,
+        use_file_transfer: bool = False,
     ) -> AbstractFileBasedStream:
         return DefaultFileBasedStream(
             config=stream_config,
@@ -310,8 +310,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             validation_policy=self._validate_and_get_validation_policy(stream_config),
             errors_collector=self.errors_collector,
             cursor=cursor,
-            use_file_transfer=self._use_file_transfer(parsed_config),
-            preserve_directory_structure=self._preserve_directory_structure(parsed_config),
+            use_file_transfer=use_file_transfer,
         )
 
     def _get_stream_from_catalog(
@@ -386,25 +385,3 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             and parsed_config.delivery_method.delivery_type == "use_file_transfer"
         )
         return use_file_transfer
-
-    @staticmethod
-    def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
-        """
-        Determines whether to preserve directory structure during file transfer.
-
-        When enabled, files maintain their subdirectory paths in the destination.
-        When disabled, files are flattened to the root of the destination.
-
-        Args:
-            parsed_config: The parsed configuration containing delivery method settings
-
-        Returns:
-            True if directory structure should be preserved (default), False otherwise
-        """
-        if (
-            FileBasedSource._use_file_transfer(parsed_config)
-            and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
-            and parsed_config.delivery_method.preserve_directory_structure is not None
-        ):
-            return parsed_config.delivery_method.preserve_directory_structure
-        return True
```
airbyte_cdk/sources/file_based/file_based_stream_reader.py

```diff
@@ -135,17 +135,6 @@ class AbstractFileBasedStreamReader(ABC):
             return use_file_transfer
         return False
 
-    def preserve_directory_structure(self) -> bool:
-        # fall back to preserve subdirectories if config is not present or incomplete
-        if (
-            self.use_file_transfer()
-            and self.config
-            and hasattr(self.config.delivery_method, "preserve_directory_structure")
-            and self.config.delivery_method.preserve_directory_structure is not None
-        ):
-            return self.config.delivery_method.preserve_directory_structure
-        return True
-
     @abstractmethod
     def get_file(
         self, file: RemoteFile, local_directory: str, logger: logging.Logger
@@ -170,13 +159,10 @@ class AbstractFileBasedStreamReader(ABC):
         """
         ...
 
-    def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
-        preserve_directory_structure = self.preserve_directory_structure()
-        if preserve_directory_structure:
-            # Remove left slashes from source path format to make relative path for writing locally
-            file_relative_path = file.uri.lstrip("/")
-        else:
-            file_relative_path = path.basename(file.uri)
+    @staticmethod
+    def _get_file_transfer_paths(file: RemoteFile, local_directory: str) -> List[str]:
+        # Remove left slashes from source path format to make relative path for writing locally
+        file_relative_path = file.uri.lstrip("/")
         local_file_path = path.join(local_directory, file_relative_path)
 
         # Ensure the local directory exists
```
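After this change the reader always preserves the remote directory layout; the only normalization left is stripping leading slashes before the join. A hedged sketch of the path math (an illustrative helper, not the CDK method, which also creates the directory):

```python
import posixpath
from typing import List


def file_transfer_paths(file_uri: str, local_directory: str) -> List[str]:
    # Strip leading slashes so the remote URI becomes a relative path, then keep
    # its directory structure under the local download directory.
    file_relative_path = file_uri.lstrip("/")
    local_file_path = posixpath.join(local_directory, file_relative_path)
    return [file_relative_path, local_file_path]


# posixpath keeps the example deterministic across platforms.
assert file_transfer_paths("/inbox/2024/a.csv", "/tmp/files") == [
    "inbox/2024/a.csv",
    "/tmp/files/inbox/2024/a.csv",
]
```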
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

```diff
@@ -2,7 +2,6 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 import logging
-import os
 import traceback
 from datetime import datetime
 from io import BytesIO, IOBase
@@ -43,34 +42,12 @@ unstructured_partition_pdf = None
 unstructured_partition_docx = None
 unstructured_partition_pptx = None
 
-AIRBYTE_NLTK_DATA_DIR = "/airbyte/nltk_data"
-TMP_NLTK_DATA_DIR = "/tmp/nltk_data"
-
-
-def get_nltk_temp_folder() -> str:
-    """
-    For non-root connectors /tmp is not currently writable, but we should allow it in the future.
-    It's safe to use /airbyte for now. Fallback to /tmp for local development.
-    """
-    try:
-        nltk_data_dir = AIRBYTE_NLTK_DATA_DIR
-        os.makedirs(nltk_data_dir, exist_ok=True)
-    except OSError:
-        nltk_data_dir = TMP_NLTK_DATA_DIR
-        os.makedirs(nltk_data_dir, exist_ok=True)
-    return nltk_data_dir
-
-
 try:
-    nltk_data_dir = get_nltk_temp_folder()
-    nltk.data.path.append(nltk_data_dir)
     nltk.data.find("tokenizers/punkt.zip")
     nltk.data.find("tokenizers/punkt_tab.zip")
-    nltk.data.find("tokenizers/averaged_perceptron_tagger_eng.zip")
 except LookupError:
-    nltk.download("punkt", download_dir=nltk_data_dir, quiet=True)
-    nltk.download("punkt_tab", download_dir=nltk_data_dir, quiet=True)
-    nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir, quiet=True)
+    nltk.download("punkt")
+    nltk.download("punkt_tab")
 
 
 def optional_decode(contents: Union[str, bytes]) -> str:
```
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py

```diff
@@ -5,17 +5,14 @@
 import asyncio
 import itertools
 import traceback
-from collections import defaultdict
 from copy import deepcopy
 from functools import cache
-from os import path
-from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union
+from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union
 
 from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
 from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
 from airbyte_cdk.sources.file_based.exceptions import (
-    DuplicatedFilesError,
     FileBasedSourceError,
     InvalidSchemaError,
     MissingSchemaError,
@@ -46,8 +43,6 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     """
 
     FILE_TRANSFER_KW = "use_file_transfer"
-    PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure"
-    FILES_KEY = "files"
     DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
     ab_last_mod_col = "_ab_source_file_last_modified"
     ab_file_name_col = "_ab_source_file_url"
@@ -55,15 +50,10 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     source_file_url = "source_file_url"
     airbyte_columns = [ab_last_mod_col, ab_file_name_col]
     use_file_transfer = False
-    preserve_directory_structure = True
 
     def __init__(self, **kwargs: Any):
         if self.FILE_TRANSFER_KW in kwargs:
             self.use_file_transfer = kwargs.pop(self.FILE_TRANSFER_KW, False)
-        if self.PRESERVE_DIRECTORY_STRUCTURE_KW in kwargs:
-            self.preserve_directory_structure = kwargs.pop(
-                self.PRESERVE_DIRECTORY_STRUCTURE_KW, True
-            )
         super().__init__(**kwargs)
 
     @property
@@ -108,33 +98,15 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         else:
             return super()._filter_schema_invalid_properties(configured_catalog_json_schema)
 
-    def _duplicated_files_names(
-        self, slices: List[dict[str, List[RemoteFile]]]
-    ) -> List[dict[str, List[str]]]:
-        seen_file_names: Dict[str, List[str]] = defaultdict(list)
-        for file_slice in slices:
-            for file_found in file_slice[self.FILES_KEY]:
-                file_name = path.basename(file_found.uri)
-                seen_file_names[file_name].append(file_found.uri)
-        return [
-            {file_name: paths} for file_name, paths in seen_file_names.items() if len(paths) > 1
-        ]
-
     def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
         # Sort files by last_modified, uri and return them grouped by last_modified
         all_files = self.list_files()
         files_to_read = self._cursor.get_files_to_sync(all_files, self.logger)
         sorted_files_to_read = sorted(files_to_read, key=lambda f: (f.last_modified, f.uri))
         slices = [
-            {self.FILES_KEY: list(group[1])}
+            {"files": list(group[1])}
             for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
         ]
-        if slices and not self.preserve_directory_structure:
-            duplicated_files_names = self._duplicated_files_names(slices)
-            if duplicated_files_names:
-                raise DuplicatedFilesError(
-                    stream=self.name, duplicated_files_names=duplicated_files_names
-                )
         return slices
 
     def transform_record(
```
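`compute_slices` groups files that share a `last_modified` timestamp into one slice after sorting by `(last_modified, uri)`. A self-contained rendering of that grouping with a toy `RemoteFile` stand-in:

```python
import itertools
from dataclasses import dataclass


@dataclass(frozen=True)
class RemoteFile:
    uri: str
    last_modified: str


files = [
    RemoteFile("b.csv", "2024-01-02"),
    RemoteFile("a.csv", "2024-01-01"),
    RemoteFile("c.csv", "2024-01-02"),
]

# Same slicing idea as compute_slices: order by (last_modified, uri), then put
# every file sharing a last_modified timestamp into one slice.
sorted_files = sorted(files, key=lambda f: (f.last_modified, f.uri))
slices = [
    {"files": list(group)}
    for _, group in itertools.groupby(sorted_files, lambda f: f.last_modified)
]
assert [f.uri for f in slices[0]["files"]] == ["a.csv"]
assert [f.uri for f in slices[1]["files"]] == ["b.csv", "c.csv"]
```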
airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py

```diff
@@ -81,10 +81,10 @@ class AbstractOauth2Authenticator(AuthBase):
         Override to define additional parameters
         """
         payload: MutableMapping[str, Any] = {
-            self.get_grant_type_name(): self.get_grant_type(),
-            self.get_client_id_name(): self.get_client_id(),
-            self.get_client_secret_name(): self.get_client_secret(),
-            self.get_refresh_token_name(): self.get_refresh_token(),
+            "grant_type": self.get_grant_type(),
+            "client_id": self.get_client_id(),
+            "client_secret": self.get_client_secret(),
+            "refresh_token": self.get_refresh_token(),
         }
 
         if self.get_scopes():
@@ -206,26 +206,14 @@ class AbstractOauth2Authenticator(AuthBase):
     def get_token_refresh_endpoint(self) -> Optional[str]:
         """Returns the endpoint to refresh the access token"""
 
-    @abstractmethod
-    def get_client_id_name(self) -> str:
-        """The client id name to authenticate"""
-
     @abstractmethod
     def get_client_id(self) -> str:
         """The client id to authenticate"""
 
-    @abstractmethod
-    def get_client_secret_name(self) -> str:
-        """The client secret name to authenticate"""
-
     @abstractmethod
     def get_client_secret(self) -> str:
         """The client secret to authenticate"""
 
-    @abstractmethod
-    def get_refresh_token_name(self) -> str:
-        """The refresh token name to authenticate"""
-
     @abstractmethod
     def get_refresh_token(self) -> Optional[str]:
         """The token used to refresh the access token when it expires"""
@@ -258,10 +246,6 @@ class AbstractOauth2Authenticator(AuthBase):
     def get_grant_type(self) -> str:
         """Returns grant_type specified for requesting access_token"""
 
-    @abstractmethod
-    def get_grant_type_name(self) -> str:
-        """Returns grant_type specified name for requesting access_token"""
-
     @property
     @abstractmethod
     def access_token(self) -> str:
```
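The removed `*_name()` hooks let connectors rename the fields of the token-refresh payload; 6.14.0 pins the standard OAuth2 field names instead. A small sketch of the resulting payload shape (an illustrative helper, not the CDK API):

```python
from typing import Any, MutableMapping


def build_refresh_payload(
    client_id: str,
    client_secret: str,
    refresh_token: str,
    grant_type: str = "refresh_token",
) -> MutableMapping[str, Any]:
    # 6.14.0 shape: the payload keys are fixed OAuth2 field names again, rather
    # than being supplied by per-connector *_name() overrides.
    return {
        "grant_type": grant_type,
        "client_id": client_id,
        "client_secret": client_secret,
        "refresh_token": refresh_token,
    }


payload = build_refresh_payload("id", "secret", "token")
assert payload["grant_type"] == "refresh_token"
```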