airbyte-cdk 6.13.0.dev0__py3-none-any.whl → 6.13.1.dev4100__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +8 -25
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +35 -52
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +7 -10
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +4 -9
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +6 -11
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +5 -16
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +13 -14
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +8 -7
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +7 -10
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +64 -71
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +18 -0
- airbyte_cdk/sources/file_based/exceptions.py +26 -0
- airbyte_cdk/sources/file_based/file_based_source.py +19 -5
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +18 -4
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +40 -2
- {airbyte_cdk-6.13.0.dev0.dist-info → airbyte_cdk-6.13.1.dev4100.dist-info}/METADATA +2 -2
- {airbyte_cdk-6.13.0.dev0.dist-info → airbyte_cdk-6.13.1.dev4100.dist-info}/RECORD +20 -20
- {airbyte_cdk-6.13.0.dev0.dist-info → airbyte_cdk-6.13.1.dev4100.dist-info}/WHEEL +1 -1
- {airbyte_cdk-6.13.0.dev0.dist-info → airbyte_cdk-6.13.1.dev4100.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.13.0.dev0.dist-info → airbyte_cdk-6.13.1.dev4100.dist-info}/entry_points.txt +0 -0
@@ -194,11 +194,10 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
|
|
194
194
|
# Some low-code sources use a combination of DeclarativeStream and regular Python streams. We can't inspect
|
195
195
|
# these legacy Python streams the way we do low-code streams to determine if they are concurrent compatible,
|
196
196
|
# so we need to treat them as synchronous
|
197
|
-
if
|
198
|
-
|
197
|
+
if (
|
198
|
+
isinstance(declarative_stream, DeclarativeStream)
|
199
|
+
and name_to_stream_mapping[declarative_stream.name]["retriever"]["type"]
|
199
200
|
== "SimpleRetriever"
|
200
|
-
or name_to_stream_mapping[declarative_stream.name]["retriever"]["type"]
|
201
|
-
== "AsyncRetriever"
|
202
201
|
):
|
203
202
|
incremental_sync_component_definition = name_to_stream_mapping[
|
204
203
|
declarative_stream.name
|
@@ -218,11 +217,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
|
|
218
217
|
and not incremental_sync_component_definition
|
219
218
|
)
|
220
219
|
|
221
|
-
is_async_job_stream = (
|
222
|
-
name_to_stream_mapping[declarative_stream.name].get("retriever", {}).get("type")
|
223
|
-
== "AsyncRetriever"
|
224
|
-
)
|
225
|
-
|
226
220
|
if self._is_datetime_incremental_without_partition_routing(
|
227
221
|
declarative_stream, incremental_sync_component_definition
|
228
222
|
):
|
@@ -274,26 +268,15 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
|
|
274
268
|
elif (
|
275
269
|
is_substream_without_incremental or is_without_partition_router_or_cursor
|
276
270
|
) and hasattr(declarative_stream.retriever, "stream_slicer"):
|
277
|
-
if is_async_job_stream:
|
278
|
-
# A stream's AsyncRetriever must be shared across all partitions because it uses a
|
279
|
-
# shared JobRepository to manage the state of jobs requests and when they are ready
|
280
|
-
async_retriever = declarative_stream.retriever
|
281
|
-
|
282
|
-
def async_retriever_factory_method() -> Retriever:
|
283
|
-
return async_retriever
|
284
|
-
|
285
|
-
retriever_factory = async_retriever_factory_method
|
286
|
-
else:
|
287
|
-
retriever_factory = self._retriever_factory(
|
288
|
-
name_to_stream_mapping[declarative_stream.name],
|
289
|
-
config,
|
290
|
-
{},
|
291
|
-
)
|
292
271
|
partition_generator = StreamSlicerPartitionGenerator(
|
293
272
|
DeclarativePartitionFactory(
|
294
273
|
declarative_stream.name,
|
295
274
|
declarative_stream.get_json_schema(),
|
296
|
-
|
275
|
+
self._retriever_factory(
|
276
|
+
name_to_stream_mapping[declarative_stream.name],
|
277
|
+
config,
|
278
|
+
{},
|
279
|
+
),
|
297
280
|
self.message_repository,
|
298
281
|
),
|
299
282
|
declarative_stream.retriever.stream_slicer,
|
@@ -112,39 +112,27 @@ class DefaultPaginator(Paginator):
|
|
112
112
|
)
|
113
113
|
if isinstance(self.url_base, str):
|
114
114
|
self.url_base = InterpolatedString(string=self.url_base, parameters=parameters)
|
115
|
-
|
116
|
-
def get_initial_token(self) -> Optional[Any]:
|
117
|
-
"""
|
118
|
-
Return the page token that should be used for the first request of a stream
|
119
|
-
|
120
|
-
WARNING: get_initial_token() should not be used by streams that use RFR that perform checkpointing
|
121
|
-
of state using page numbers. Because paginators are stateless
|
122
|
-
"""
|
123
|
-
return self.pagination_strategy.initial_token
|
115
|
+
self._token: Optional[Any] = self.pagination_strategy.initial_token
|
124
116
|
|
125
117
|
def next_page_token(
|
126
|
-
self,
|
127
|
-
response: requests.Response,
|
128
|
-
last_page_size: int,
|
129
|
-
last_record: Optional[Record],
|
130
|
-
last_page_token_value: Optional[Any] = None,
|
118
|
+
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
|
131
119
|
) -> Optional[Mapping[str, Any]]:
|
132
|
-
|
133
|
-
response
|
134
|
-
last_page_size=last_page_size,
|
135
|
-
last_record=last_record,
|
136
|
-
last_page_token_value=last_page_token_value,
|
120
|
+
self._token = self.pagination_strategy.next_page_token(
|
121
|
+
response, last_page_size, last_record
|
137
122
|
)
|
138
|
-
if
|
139
|
-
return {"next_page_token":
|
123
|
+
if self._token:
|
124
|
+
return {"next_page_token": self._token}
|
140
125
|
else:
|
141
126
|
return None
|
142
127
|
|
143
|
-
def path(self
|
144
|
-
|
145
|
-
|
128
|
+
def path(self) -> Optional[str]:
|
129
|
+
if (
|
130
|
+
self._token
|
131
|
+
and self.page_token_option
|
132
|
+
and isinstance(self.page_token_option, RequestPath)
|
133
|
+
):
|
146
134
|
# Replace url base to only return the path
|
147
|
-
return str(
|
135
|
+
return str(self._token).replace(self.url_base.eval(self.config), "") # type: ignore # url_base is casted to a InterpolatedString in __post_init__
|
148
136
|
else:
|
149
137
|
return None
|
150
138
|
|
@@ -155,7 +143,7 @@ class DefaultPaginator(Paginator):
|
|
155
143
|
stream_slice: Optional[StreamSlice] = None,
|
156
144
|
next_page_token: Optional[Mapping[str, Any]] = None,
|
157
145
|
) -> MutableMapping[str, Any]:
|
158
|
-
return self._get_request_options(RequestOptionType.request_parameter
|
146
|
+
return self._get_request_options(RequestOptionType.request_parameter)
|
159
147
|
|
160
148
|
def get_request_headers(
|
161
149
|
self,
|
@@ -164,7 +152,7 @@ class DefaultPaginator(Paginator):
|
|
164
152
|
stream_slice: Optional[StreamSlice] = None,
|
165
153
|
next_page_token: Optional[Mapping[str, Any]] = None,
|
166
154
|
) -> Mapping[str, str]:
|
167
|
-
return self._get_request_options(RequestOptionType.header
|
155
|
+
return self._get_request_options(RequestOptionType.header)
|
168
156
|
|
169
157
|
def get_request_body_data(
|
170
158
|
self,
|
@@ -173,7 +161,7 @@ class DefaultPaginator(Paginator):
|
|
173
161
|
stream_slice: Optional[StreamSlice] = None,
|
174
162
|
next_page_token: Optional[Mapping[str, Any]] = None,
|
175
163
|
) -> Mapping[str, Any]:
|
176
|
-
return self._get_request_options(RequestOptionType.body_data
|
164
|
+
return self._get_request_options(RequestOptionType.body_data)
|
177
165
|
|
178
166
|
def get_request_body_json(
|
179
167
|
self,
|
@@ -182,21 +170,25 @@ class DefaultPaginator(Paginator):
|
|
182
170
|
stream_slice: Optional[StreamSlice] = None,
|
183
171
|
next_page_token: Optional[Mapping[str, Any]] = None,
|
184
172
|
) -> Mapping[str, Any]:
|
185
|
-
return self._get_request_options(RequestOptionType.body_json
|
173
|
+
return self._get_request_options(RequestOptionType.body_json)
|
186
174
|
|
187
|
-
def
|
188
|
-
|
189
|
-
|
175
|
+
def reset(self, reset_value: Optional[Any] = None) -> None:
|
176
|
+
if reset_value:
|
177
|
+
self.pagination_strategy.reset(reset_value=reset_value)
|
178
|
+
else:
|
179
|
+
self.pagination_strategy.reset()
|
180
|
+
self._token = self.pagination_strategy.initial_token
|
181
|
+
|
182
|
+
def _get_request_options(self, option_type: RequestOptionType) -> MutableMapping[str, Any]:
|
190
183
|
options = {}
|
191
184
|
|
192
|
-
token = next_page_token.get("next_page_token") if next_page_token else None
|
193
185
|
if (
|
194
186
|
self.page_token_option
|
195
|
-
and
|
187
|
+
and self._token is not None
|
196
188
|
and isinstance(self.page_token_option, RequestOption)
|
197
189
|
and self.page_token_option.inject_into == option_type
|
198
190
|
):
|
199
|
-
options[self.page_token_option.field_name.eval(config=self.config)] =
|
191
|
+
options[self.page_token_option.field_name.eval(config=self.config)] = self._token # type: ignore # field_name is always cast to an interpolated string
|
200
192
|
if (
|
201
193
|
self.page_size_option
|
202
194
|
and self.pagination_strategy.get_page_size()
|
@@ -212,9 +204,6 @@ class PaginatorTestReadDecorator(Paginator):
|
|
212
204
|
"""
|
213
205
|
In some cases, we want to limit the number of requests that are made to the backend source. This class allows for limiting the number of
|
214
206
|
pages that are queried throughout a read command.
|
215
|
-
|
216
|
-
WARNING: This decorator is not currently thread-safe like the rest of the low-code framework because it has
|
217
|
-
an internal state to track the current number of pages counted so that it can exit early during a test read
|
218
207
|
"""
|
219
208
|
|
220
209
|
_PAGE_COUNT_BEFORE_FIRST_NEXT_CALL = 1
|
@@ -228,27 +217,17 @@ class PaginatorTestReadDecorator(Paginator):
|
|
228
217
|
self._decorated = decorated
|
229
218
|
self._page_count = self._PAGE_COUNT_BEFORE_FIRST_NEXT_CALL
|
230
219
|
|
231
|
-
def get_initial_token(self) -> Optional[Any]:
|
232
|
-
self._page_count = self._PAGE_COUNT_BEFORE_FIRST_NEXT_CALL
|
233
|
-
return self._decorated.get_initial_token()
|
234
|
-
|
235
220
|
def next_page_token(
|
236
|
-
self,
|
237
|
-
response: requests.Response,
|
238
|
-
last_page_size: int,
|
239
|
-
last_record: Optional[Record],
|
240
|
-
last_page_token_value: Optional[Any] = None,
|
221
|
+
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
|
241
222
|
) -> Optional[Mapping[str, Any]]:
|
242
223
|
if self._page_count >= self._maximum_number_of_pages:
|
243
224
|
return None
|
244
225
|
|
245
226
|
self._page_count += 1
|
246
|
-
return self._decorated.next_page_token(
|
247
|
-
response, last_page_size, last_record, last_page_token_value
|
248
|
-
)
|
227
|
+
return self._decorated.next_page_token(response, last_page_size, last_record)
|
249
228
|
|
250
|
-
def path(self
|
251
|
-
return self._decorated.path(
|
229
|
+
def path(self) -> Optional[str]:
|
230
|
+
return self._decorated.path()
|
252
231
|
|
253
232
|
def get_request_params(
|
254
233
|
self,
|
@@ -293,3 +272,7 @@ class PaginatorTestReadDecorator(Paginator):
|
|
293
272
|
return self._decorated.get_request_body_json(
|
294
273
|
stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token
|
295
274
|
)
|
275
|
+
|
276
|
+
def reset(self, reset_value: Optional[Any] = None) -> None:
|
277
|
+
self._decorated.reset()
|
278
|
+
self._page_count = self._PAGE_COUNT_BEFORE_FIRST_NEXT_CALL
|
@@ -19,7 +19,7 @@ class NoPagination(Paginator):
|
|
19
19
|
|
20
20
|
parameters: InitVar[Mapping[str, Any]]
|
21
21
|
|
22
|
-
def path(self
|
22
|
+
def path(self) -> Optional[str]:
|
23
23
|
return None
|
24
24
|
|
25
25
|
def get_request_params(
|
@@ -58,14 +58,11 @@ class NoPagination(Paginator):
|
|
58
58
|
) -> Mapping[str, Any]:
|
59
59
|
return {}
|
60
60
|
|
61
|
-
def get_initial_token(self) -> Optional[Any]:
|
62
|
-
return None
|
63
|
-
|
64
61
|
def next_page_token(
|
65
|
-
self,
|
66
|
-
|
67
|
-
last_page_size: int,
|
68
|
-
last_record: Optional[Record],
|
69
|
-
last_page_token_value: Optional[Any],
|
70
|
-
) -> Optional[Mapping[str, Any]]:
|
62
|
+
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
|
63
|
+
) -> Mapping[str, Any]:
|
71
64
|
return {}
|
65
|
+
|
66
|
+
def reset(self, reset_value: Optional[Any] = None) -> None:
|
67
|
+
# No state to reset
|
68
|
+
pass
|
@@ -24,18 +24,14 @@ class Paginator(ABC, RequestOptionsProvider):
|
|
24
24
|
"""
|
25
25
|
|
26
26
|
@abstractmethod
|
27
|
-
def
|
27
|
+
def reset(self, reset_value: Optional[Any] = None) -> None:
|
28
28
|
"""
|
29
|
-
|
29
|
+
Reset the pagination's inner state
|
30
30
|
"""
|
31
31
|
|
32
32
|
@abstractmethod
|
33
33
|
def next_page_token(
|
34
|
-
self,
|
35
|
-
response: requests.Response,
|
36
|
-
last_page_size: int,
|
37
|
-
last_record: Optional[Record],
|
38
|
-
last_page_token_value: Optional[Any],
|
34
|
+
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
|
39
35
|
) -> Optional[Mapping[str, Any]]:
|
40
36
|
"""
|
41
37
|
Returns the next_page_token to use to fetch the next page of records.
|
@@ -43,13 +39,12 @@ class Paginator(ABC, RequestOptionsProvider):
|
|
43
39
|
:param response: the response to process
|
44
40
|
:param last_page_size: the number of records read from the response
|
45
41
|
:param last_record: the last record extracted from the response
|
46
|
-
:param last_page_token_value: The current value of the page token made on the last request
|
47
42
|
:return: A mapping {"next_page_token": <token>} for the next page from the input response object. Returning None means there are no more pages to read in this response.
|
48
43
|
"""
|
49
44
|
pass
|
50
45
|
|
51
46
|
@abstractmethod
|
52
|
-
def path(self
|
47
|
+
def path(self) -> Optional[str]:
|
53
48
|
"""
|
54
49
|
Returns the URL path to hit to fetch the next page of records
|
55
50
|
|
airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py
CHANGED
@@ -43,6 +43,7 @@ class CursorPaginationStrategy(PaginationStrategy):
|
|
43
43
|
)
|
44
44
|
|
45
45
|
def __post_init__(self, parameters: Mapping[str, Any]) -> None:
|
46
|
+
self._initial_cursor = None
|
46
47
|
if isinstance(self.cursor_value, str):
|
47
48
|
self._cursor_value = InterpolatedString.create(self.cursor_value, parameters=parameters)
|
48
49
|
else:
|
@@ -56,19 +57,10 @@ class CursorPaginationStrategy(PaginationStrategy):
|
|
56
57
|
|
57
58
|
@property
|
58
59
|
def initial_token(self) -> Optional[Any]:
|
59
|
-
|
60
|
-
CursorPaginationStrategy does not have an initial value because the next cursor is typically included
|
61
|
-
in the response of the first request. For Resumable Full Refresh streams that checkpoint the page
|
62
|
-
cursor, the next cursor should be read from the state or stream slice object.
|
63
|
-
"""
|
64
|
-
return None
|
60
|
+
return self._initial_cursor
|
65
61
|
|
66
62
|
def next_page_token(
|
67
|
-
self,
|
68
|
-
response: requests.Response,
|
69
|
-
last_page_size: int,
|
70
|
-
last_record: Optional[Record],
|
71
|
-
last_page_token_value: Optional[Any] = None,
|
63
|
+
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
|
72
64
|
) -> Optional[Any]:
|
73
65
|
decoded_response = next(self.decoder.decode(response))
|
74
66
|
|
@@ -95,5 +87,8 @@ class CursorPaginationStrategy(PaginationStrategy):
|
|
95
87
|
)
|
96
88
|
return token if token else None
|
97
89
|
|
90
|
+
def reset(self, reset_value: Optional[Any] = None) -> None:
|
91
|
+
self._initial_cursor = reset_value
|
92
|
+
|
98
93
|
def get_page_size(self) -> Optional[int]:
|
99
94
|
return self.page_size
|
@@ -52,6 +52,7 @@ class OffsetIncrement(PaginationStrategy):
|
|
52
52
|
inject_on_first_request: bool = False
|
53
53
|
|
54
54
|
def __post_init__(self, parameters: Mapping[str, Any]) -> None:
|
55
|
+
self._offset = 0
|
55
56
|
page_size = str(self.page_size) if isinstance(self.page_size, int) else self.page_size
|
56
57
|
if page_size:
|
57
58
|
self._page_size: Optional[InterpolatedString] = InterpolatedString(
|
@@ -63,15 +64,11 @@ class OffsetIncrement(PaginationStrategy):
|
|
63
64
|
@property
|
64
65
|
def initial_token(self) -> Optional[Any]:
|
65
66
|
if self.inject_on_first_request:
|
66
|
-
return
|
67
|
+
return self._offset
|
67
68
|
return None
|
68
69
|
|
69
70
|
def next_page_token(
|
70
|
-
self,
|
71
|
-
response: requests.Response,
|
72
|
-
last_page_size: int,
|
73
|
-
last_record: Optional[Record],
|
74
|
-
last_page_token_value: Optional[Any] = None,
|
71
|
+
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
|
75
72
|
) -> Optional[Any]:
|
76
73
|
decoded_response = next(self.decoder.decode(response))
|
77
74
|
|
@@ -81,17 +78,9 @@ class OffsetIncrement(PaginationStrategy):
|
|
81
78
|
and last_page_size < self._page_size.eval(self.config, response=decoded_response)
|
82
79
|
) or last_page_size == 0:
|
83
80
|
return None
|
84
|
-
elif last_page_token_value is None:
|
85
|
-
# If the OffsetIncrement strategy does not inject on the first request, the incoming last_page_token_value
|
86
|
-
# will be None. For this case, we assume that None was the first page and progress to the next offset
|
87
|
-
return 0 + last_page_size
|
88
|
-
elif not isinstance(last_page_token_value, int):
|
89
|
-
raise ValueError(
|
90
|
-
"The page token for a OffsetIncrement pagination strategy must be an integer"
|
91
|
-
)
|
92
81
|
else:
|
93
|
-
|
94
|
-
return
|
82
|
+
self._offset += last_page_size
|
83
|
+
return self._offset
|
95
84
|
|
96
85
|
def reset(self, reset_value: Optional[Any] = 0) -> None:
|
97
86
|
if not isinstance(reset_value, int):
|
@@ -31,6 +31,7 @@ class PageIncrement(PaginationStrategy):
|
|
31
31
|
inject_on_first_request: bool = False
|
32
32
|
|
33
33
|
def __post_init__(self, parameters: Mapping[str, Any]) -> None:
|
34
|
+
self._page = self.start_from_page
|
34
35
|
if isinstance(self.page_size, int) or (self.page_size is None):
|
35
36
|
self._page_size = self.page_size
|
36
37
|
else:
|
@@ -42,30 +43,28 @@ class PageIncrement(PaginationStrategy):
|
|
42
43
|
@property
|
43
44
|
def initial_token(self) -> Optional[Any]:
|
44
45
|
if self.inject_on_first_request:
|
45
|
-
return self.
|
46
|
+
return self._page
|
46
47
|
return None
|
47
48
|
|
48
49
|
def next_page_token(
|
49
|
-
self,
|
50
|
-
response: requests.Response,
|
51
|
-
last_page_size: int,
|
52
|
-
last_record: Optional[Record],
|
53
|
-
last_page_token_value: Optional[Any],
|
50
|
+
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
|
54
51
|
) -> Optional[Any]:
|
55
52
|
# Stop paginating when there are fewer records than the page size or the current page has no records
|
56
53
|
if (self._page_size and last_page_size < self._page_size) or last_page_size == 0:
|
57
54
|
return None
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
55
|
+
else:
|
56
|
+
self._page += 1
|
57
|
+
return self._page
|
58
|
+
|
59
|
+
def reset(self, reset_value: Optional[Any] = None) -> None:
|
60
|
+
if reset_value is None:
|
61
|
+
self._page = self.start_from_page
|
62
|
+
elif not isinstance(reset_value, int):
|
64
63
|
raise ValueError(
|
65
|
-
"
|
64
|
+
f"Reset value {reset_value} for PageIncrement pagination strategy was not an integer"
|
66
65
|
)
|
67
66
|
else:
|
68
|
-
|
67
|
+
self._page = reset_value
|
69
68
|
|
70
69
|
def get_page_size(self) -> Optional[int]:
|
71
70
|
return self._page_size
|
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
from abc import abstractmethod
|
6
6
|
from dataclasses import dataclass
|
7
|
-
from typing import Any,
|
7
|
+
from typing import Any, Optional
|
8
8
|
|
9
9
|
import requests
|
10
10
|
|
@@ -26,21 +26,22 @@ class PaginationStrategy:
|
|
26
26
|
|
27
27
|
@abstractmethod
|
28
28
|
def next_page_token(
|
29
|
-
self,
|
30
|
-
response: requests.Response,
|
31
|
-
last_page_size: int,
|
32
|
-
last_record: Optional[Record],
|
33
|
-
last_page_token_value: Optional[Any],
|
29
|
+
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
|
34
30
|
) -> Optional[Any]:
|
35
31
|
"""
|
36
32
|
:param response: response to process
|
37
33
|
:param last_page_size: the number of records read from the response
|
38
34
|
:param last_record: the last record extracted from the response
|
39
|
-
:param last_page_token_value: The current value of the page token made on the last request
|
40
35
|
:return: next page token. Returns None if there are no more pages to fetch
|
41
36
|
"""
|
42
37
|
pass
|
43
38
|
|
39
|
+
@abstractmethod
|
40
|
+
def reset(self, reset_value: Optional[Any] = None) -> None:
|
41
|
+
"""
|
42
|
+
Reset the pagination's inner state
|
43
|
+
"""
|
44
|
+
|
44
45
|
@abstractmethod
|
45
46
|
def get_page_size(self) -> Optional[int]:
|
46
47
|
"""
|
@@ -44,19 +44,16 @@ class StopConditionPaginationStrategyDecorator(PaginationStrategy):
|
|
44
44
|
self._stop_condition = stop_condition
|
45
45
|
|
46
46
|
def next_page_token(
|
47
|
-
self,
|
48
|
-
response: requests.Response,
|
49
|
-
last_page_size: int,
|
50
|
-
last_record: Optional[Record],
|
51
|
-
last_page_token_value: Optional[Any] = None,
|
47
|
+
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
|
52
48
|
) -> Optional[Any]:
|
53
|
-
# We evaluate in reverse order because the assumption is that most of the APIs using data feed structure
|
54
|
-
#
|
49
|
+
# We evaluate in reverse order because the assumption is that most of the APIs using data feed structure will return records in
|
50
|
+
# descending order. In terms of performance/memory, we return the records lazily
|
55
51
|
if last_record and self._stop_condition.is_met(last_record):
|
56
52
|
return None
|
57
|
-
return self._delegate.next_page_token(
|
58
|
-
|
59
|
-
|
53
|
+
return self._delegate.next_page_token(response, last_page_size, last_record)
|
54
|
+
|
55
|
+
def reset(self, reset_value: Optional[Any] = None) -> None:
|
56
|
+
self._delegate.reset(reset_value)
|
60
57
|
|
61
58
|
def get_page_size(self) -> Optional[int]:
|
62
59
|
return self._delegate.get_page_size()
|
@@ -6,7 +6,18 @@ import json
|
|
6
6
|
from dataclasses import InitVar, dataclass, field
|
7
7
|
from functools import partial
|
8
8
|
from itertools import islice
|
9
|
-
from typing import
|
9
|
+
from typing import (
|
10
|
+
Any,
|
11
|
+
Callable,
|
12
|
+
Iterable,
|
13
|
+
List,
|
14
|
+
Mapping,
|
15
|
+
MutableMapping,
|
16
|
+
Optional,
|
17
|
+
Set,
|
18
|
+
Tuple,
|
19
|
+
Union,
|
20
|
+
)
|
10
21
|
|
11
22
|
import requests
|
12
23
|
|
@@ -79,6 +90,9 @@ class SimpleRetriever(Retriever):
|
|
79
90
|
|
80
91
|
def __post_init__(self, parameters: Mapping[str, Any]) -> None:
|
81
92
|
self._paginator = self.paginator or NoPagination(parameters=parameters)
|
93
|
+
self._last_response: Optional[requests.Response] = None
|
94
|
+
self._last_page_size: int = 0
|
95
|
+
self._last_record: Optional[Record] = None
|
82
96
|
self._parameters = parameters
|
83
97
|
self._name = (
|
84
98
|
InterpolatedString(self._name, parameters=parameters)
|
@@ -86,6 +100,10 @@ class SimpleRetriever(Retriever):
|
|
86
100
|
else self._name
|
87
101
|
)
|
88
102
|
|
103
|
+
# This mapping is used during a resumable full refresh syncs to indicate whether a partition has started syncing
|
104
|
+
# records. Partitions serve as the key and map to True if they already began processing records
|
105
|
+
self._partition_started: MutableMapping[Any, bool] = dict()
|
106
|
+
|
89
107
|
@property # type: ignore
|
90
108
|
def name(self) -> str:
|
91
109
|
"""
|
@@ -233,13 +251,17 @@ class SimpleRetriever(Retriever):
|
|
233
251
|
raise ValueError("Request body json cannot be a string")
|
234
252
|
return body_json
|
235
253
|
|
236
|
-
def _paginator_path(
|
254
|
+
def _paginator_path(
|
255
|
+
self,
|
256
|
+
) -> Optional[str]:
|
237
257
|
"""
|
238
258
|
If the paginator points to a path, follow it, else return nothing so the requester is used.
|
259
|
+
:param stream_state:
|
260
|
+
:param stream_slice:
|
239
261
|
:param next_page_token:
|
240
262
|
:return:
|
241
263
|
"""
|
242
|
-
return self._paginator.path(
|
264
|
+
return self._paginator.path()
|
243
265
|
|
244
266
|
def _parse_response(
|
245
267
|
self,
|
@@ -250,15 +272,22 @@ class SimpleRetriever(Retriever):
|
|
250
272
|
next_page_token: Optional[Mapping[str, Any]] = None,
|
251
273
|
) -> Iterable[Record]:
|
252
274
|
if not response:
|
275
|
+
self._last_response = None
|
253
276
|
yield from []
|
254
277
|
else:
|
255
|
-
|
278
|
+
self._last_response = response
|
279
|
+
record_generator = self.record_selector.select_records(
|
256
280
|
response=response,
|
257
281
|
stream_state=stream_state,
|
258
282
|
records_schema=records_schema,
|
259
283
|
stream_slice=stream_slice,
|
260
284
|
next_page_token=next_page_token,
|
261
285
|
)
|
286
|
+
self._last_page_size = 0
|
287
|
+
for record in record_generator:
|
288
|
+
self._last_page_size += 1
|
289
|
+
self._last_record = record
|
290
|
+
yield record
|
262
291
|
|
263
292
|
@property # type: ignore
|
264
293
|
def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
|
@@ -270,13 +299,7 @@ class SimpleRetriever(Retriever):
|
|
270
299
|
if not isinstance(value, property):
|
271
300
|
self._primary_key = value
|
272
301
|
|
273
|
-
def _next_page_token(
|
274
|
-
self,
|
275
|
-
response: requests.Response,
|
276
|
-
last_page_size: int,
|
277
|
-
last_record: Optional[Record],
|
278
|
-
last_page_token_value: Optional[Any],
|
279
|
-
) -> Optional[Mapping[str, Any]]:
|
302
|
+
def _next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
|
280
303
|
"""
|
281
304
|
Specifies a pagination strategy.
|
282
305
|
|
@@ -284,12 +307,7 @@ class SimpleRetriever(Retriever):
|
|
284
307
|
|
285
308
|
:return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response.
|
286
309
|
"""
|
287
|
-
return self._paginator.next_page_token(
|
288
|
-
response=response,
|
289
|
-
last_page_size=last_page_size,
|
290
|
-
last_record=last_record,
|
291
|
-
last_page_token_value=last_page_token_value,
|
292
|
-
)
|
310
|
+
return self._paginator.next_page_token(response, self._last_page_size, self._last_record)
|
293
311
|
|
294
312
|
def _fetch_next_page(
|
295
313
|
self,
|
@@ -298,7 +316,7 @@ class SimpleRetriever(Retriever):
|
|
298
316
|
next_page_token: Optional[Mapping[str, Any]] = None,
|
299
317
|
) -> Optional[requests.Response]:
|
300
318
|
return self.requester.send_request(
|
301
|
-
path=self._paginator_path(
|
319
|
+
path=self._paginator_path(),
|
302
320
|
stream_state=stream_state,
|
303
321
|
stream_slice=stream_slice,
|
304
322
|
next_page_token=next_page_token,
|
@@ -327,37 +345,20 @@ class SimpleRetriever(Retriever):
|
|
327
345
|
# This logic is similar to _read_pages in the HttpStream class. When making changes here, consider making changes there as well.
|
328
346
|
def _read_pages(
|
329
347
|
self,
|
330
|
-
records_generator_fn: Callable[[Optional[requests.Response]], Iterable[
|
348
|
+
records_generator_fn: Callable[[Optional[requests.Response]], Iterable[StreamData]],
|
331
349
|
stream_state: Mapping[str, Any],
|
332
350
|
stream_slice: StreamSlice,
|
333
|
-
) -> Iterable[
|
351
|
+
) -> Iterable[StreamData]:
|
334
352
|
pagination_complete = False
|
335
|
-
|
336
|
-
next_page_token: Optional[Mapping[str, Any]] = (
|
337
|
-
{"next_page_token": initial_token} if initial_token else None
|
338
|
-
)
|
353
|
+
next_page_token = None
|
339
354
|
while not pagination_complete:
|
340
355
|
response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
|
341
|
-
|
342
|
-
last_page_size = 0
|
343
|
-
last_record: Optional[Record] = None
|
344
|
-
for record in records_generator_fn(response):
|
345
|
-
last_page_size += 1
|
346
|
-
last_record = record
|
347
|
-
yield record
|
356
|
+
yield from records_generator_fn(response)
|
348
357
|
|
349
358
|
if not response:
|
350
359
|
pagination_complete = True
|
351
360
|
else:
|
352
|
-
|
353
|
-
next_page_token.get("next_page_token") if next_page_token else None
|
354
|
-
)
|
355
|
-
next_page_token = self._next_page_token(
|
356
|
-
response=response,
|
357
|
-
last_page_size=last_page_size,
|
358
|
-
last_record=last_record,
|
359
|
-
last_page_token_value=last_page_token_value,
|
360
|
-
)
|
361
|
+
next_page_token = self._next_page_token(response)
|
361
362
|
if not next_page_token:
|
362
363
|
pagination_complete = True
|
363
364
|
|
@@ -366,38 +367,19 @@ class SimpleRetriever(Retriever):
|
|
366
367
|
|
367
368
|
def _read_single_page(
|
368
369
|
self,
|
369
|
-
records_generator_fn: Callable[[Optional[requests.Response]], Iterable[
|
370
|
+
records_generator_fn: Callable[[Optional[requests.Response]], Iterable[StreamData]],
|
370
371
|
stream_state: Mapping[str, Any],
|
371
372
|
stream_slice: StreamSlice,
|
372
373
|
) -> Iterable[StreamData]:
|
373
|
-
|
374
|
-
|
375
|
-
initial_token = self._paginator.get_initial_token()
|
376
|
-
next_page_token: Optional[Mapping[str, Any]] = (
|
377
|
-
{"next_page_token": initial_token} if initial_token else None
|
378
|
-
)
|
379
|
-
|
380
|
-
response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
|
381
|
-
|
382
|
-
last_page_size = 0
|
383
|
-
last_record: Optional[Record] = None
|
384
|
-
for record in records_generator_fn(response):
|
385
|
-
last_page_size += 1
|
386
|
-
last_record = record
|
387
|
-
yield record
|
374
|
+
response = self._fetch_next_page(stream_state, stream_slice)
|
375
|
+
yield from records_generator_fn(response)
|
388
376
|
|
389
377
|
if not response:
|
390
|
-
next_page_token = {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
|
378
|
+
next_page_token: Mapping[str, Any] = {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
|
391
379
|
else:
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
next_page_token = self._next_page_token(
|
396
|
-
response=response,
|
397
|
-
last_page_size=last_page_size,
|
398
|
-
last_record=last_record,
|
399
|
-
last_page_token_value=last_page_token_value,
|
400
|
-
) or {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
|
380
|
+
next_page_token = self._next_page_token(response) or {
|
381
|
+
FULL_REFRESH_SYNC_COMPLETE_KEY: True
|
382
|
+
}
|
401
383
|
|
402
384
|
if self.cursor:
|
403
385
|
self.cursor.close_slice(
|
@@ -432,14 +414,25 @@ class SimpleRetriever(Retriever):
|
|
432
414
|
if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
|
433
415
|
stream_state = self.state
|
434
416
|
|
435
|
-
# Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
|
436
|
-
#
|
437
|
-
#
|
417
|
+
# Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to fetch more records
|
418
|
+
# The platform deletes stream state for full refresh streams before starting a new job, so we don't need to worry about
|
419
|
+
# this value existing for the initial attempt
|
438
420
|
if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
|
439
421
|
return
|
422
|
+
cursor_value = stream_state.get("next_page_token")
|
423
|
+
|
424
|
+
# The first attempt to read a page for the current partition should reset the paginator to the current
|
425
|
+
# cursor state which is initially assigned to the incoming state from the platform
|
426
|
+
partition_key = self._to_partition_key(_slice.partition)
|
427
|
+
if partition_key not in self._partition_started:
|
428
|
+
self._partition_started[partition_key] = True
|
429
|
+
self._paginator.reset(reset_value=cursor_value)
|
440
430
|
|
441
431
|
yield from self._read_single_page(record_generator, stream_state, _slice)
|
442
432
|
else:
|
433
|
+
# Fixing paginator types has a long tail of dependencies
|
434
|
+
self._paginator.reset()
|
435
|
+
|
443
436
|
for stream_data in self._read_pages(record_generator, self.state, _slice):
|
444
437
|
current_record = self._extract_record(stream_data, _slice)
|
445
438
|
if self.cursor and current_record:
|
@@ -525,7 +518,7 @@ class SimpleRetriever(Retriever):
|
|
525
518
|
stream_state: Mapping[str, Any],
|
526
519
|
records_schema: Mapping[str, Any],
|
527
520
|
stream_slice: Optional[StreamSlice],
|
528
|
-
) -> Iterable[
|
521
|
+
) -> Iterable[StreamData]:
|
529
522
|
yield from self._parse_response(
|
530
523
|
response,
|
531
524
|
stream_slice=stream_slice,
|
@@ -569,7 +562,7 @@ class SimpleRetrieverTestReadDecorator(SimpleRetriever):
|
|
569
562
|
next_page_token: Optional[Mapping[str, Any]] = None,
|
570
563
|
) -> Optional[requests.Response]:
|
571
564
|
return self.requester.send_request(
|
572
|
-
path=self._paginator_path(
|
565
|
+
path=self._paginator_path(),
|
573
566
|
stream_state=stream_state,
|
574
567
|
stream_slice=stream_slice,
|
575
568
|
next_page_token=next_page_token,
|
@@ -14,6 +14,13 @@ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileB
|
|
14
14
|
from airbyte_cdk.sources.utils import schema_helpers
|
15
15
|
|
16
16
|
|
17
|
+
class DeliveryOptions(BaseModel):
|
18
|
+
preserve_subdirectories_directories: bool = Field(
|
19
|
+
True,
|
20
|
+
description="Flag indicating we should preserve subdirectories directories",
|
21
|
+
)
|
22
|
+
|
23
|
+
|
17
24
|
class DeliverRecords(BaseModel):
|
18
25
|
class Config(OneOfOptionConfig):
|
19
26
|
title = "Replicate Records"
|
@@ -30,6 +37,11 @@ class DeliverRawFiles(BaseModel):
|
|
30
37
|
discriminator = "delivery_type"
|
31
38
|
|
32
39
|
delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)
|
40
|
+
delivery_options: Optional[DeliveryOptions] = Field(
|
41
|
+
title="Delivery Options",
|
42
|
+
type="object",
|
43
|
+
order=2,
|
44
|
+
)
|
33
45
|
|
34
46
|
|
35
47
|
class AbstractFileBasedSpec(BaseModel):
|
@@ -65,6 +77,12 @@ class AbstractFileBasedSpec(BaseModel):
|
|
65
77
|
airbyte_hidden=True,
|
66
78
|
)
|
67
79
|
|
80
|
+
delivery_options: Optional[DeliveryOptions] = Field(
|
81
|
+
title="Delivery Options",
|
82
|
+
type="object",
|
83
|
+
order=8,
|
84
|
+
)
|
85
|
+
|
68
86
|
@classmethod
|
69
87
|
@abstractmethod
|
70
88
|
def documentation_url(cls) -> AnyUrl:
|
@@ -111,6 +111,10 @@ class ErrorListingFiles(BaseFileBasedSourceError):
|
|
111
111
|
pass
|
112
112
|
|
113
113
|
|
114
|
+
class DuplicatedFilesError(BaseFileBasedSourceError):
|
115
|
+
pass
|
116
|
+
|
117
|
+
|
114
118
|
class CustomFileBasedException(AirbyteTracedException):
|
115
119
|
"""
|
116
120
|
A specialized exception for file-based connectors.
|
@@ -123,3 +127,25 @@ class CustomFileBasedException(AirbyteTracedException):
|
|
123
127
|
|
124
128
|
class FileSizeLimitError(CustomFileBasedException):
|
125
129
|
pass
|
130
|
+
|
131
|
+
|
132
|
+
def format_duplicate_files_error_message(
|
133
|
+
stream_name: str, duplicated_files_names: List[dict[str, List[str]]]
|
134
|
+
) -> str:
|
135
|
+
duplicated_files_messages = []
|
136
|
+
for duplicated_file in duplicated_files_names:
|
137
|
+
for duplicated_file_name, file_paths in duplicated_file.items():
|
138
|
+
file_duplicated_message = (
|
139
|
+
f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
|
140
|
+
+ "".join(f"\n - {file_paths}")
|
141
|
+
)
|
142
|
+
duplicated_files_messages.append(file_duplicated_message)
|
143
|
+
|
144
|
+
error_message = (
|
145
|
+
f"ERROR: Duplicate filenames found for stream {stream_name}. "
|
146
|
+
"Duplicate file names are not allowed if the Preserve Subdirectories in File Paths option is disabled. "
|
147
|
+
"Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
|
148
|
+
+ "\n".join(duplicated_files_messages)
|
149
|
+
)
|
150
|
+
|
151
|
+
return error_message
|
@@ -242,7 +242,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
242
242
|
stream=self._make_default_stream(
|
243
243
|
stream_config=stream_config,
|
244
244
|
cursor=cursor,
|
245
|
-
|
245
|
+
parsed_config=parsed_config,
|
246
246
|
),
|
247
247
|
source=self,
|
248
248
|
logger=self.logger,
|
@@ -273,7 +273,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
273
273
|
stream=self._make_default_stream(
|
274
274
|
stream_config=stream_config,
|
275
275
|
cursor=cursor,
|
276
|
-
|
276
|
+
parsed_config=parsed_config,
|
277
277
|
),
|
278
278
|
source=self,
|
279
279
|
logger=self.logger,
|
@@ -285,7 +285,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
285
285
|
stream = self._make_default_stream(
|
286
286
|
stream_config=stream_config,
|
287
287
|
cursor=cursor,
|
288
|
-
|
288
|
+
parsed_config=parsed_config,
|
289
289
|
)
|
290
290
|
|
291
291
|
streams.append(stream)
|
@@ -298,7 +298,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
298
298
|
self,
|
299
299
|
stream_config: FileBasedStreamConfig,
|
300
300
|
cursor: Optional[AbstractFileBasedCursor],
|
301
|
-
|
301
|
+
parsed_config: AbstractFileBasedSpec,
|
302
302
|
) -> AbstractFileBasedStream:
|
303
303
|
return DefaultFileBasedStream(
|
304
304
|
config=stream_config,
|
@@ -310,7 +310,10 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
310
310
|
validation_policy=self._validate_and_get_validation_policy(stream_config),
|
311
311
|
errors_collector=self.errors_collector,
|
312
312
|
cursor=cursor,
|
313
|
-
use_file_transfer=
|
313
|
+
use_file_transfer=self._use_file_transfer(parsed_config),
|
314
|
+
preserve_subdirectories_directories=self._preserve_subdirectories_directories(
|
315
|
+
parsed_config
|
316
|
+
),
|
314
317
|
)
|
315
318
|
|
316
319
|
def _get_stream_from_catalog(
|
@@ -385,3 +388,14 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
385
388
|
and parsed_config.delivery_method.delivery_type == "use_file_transfer"
|
386
389
|
)
|
387
390
|
return use_file_transfer
|
391
|
+
|
392
|
+
@staticmethod
|
393
|
+
def _preserve_subdirectories_directories(parsed_config: AbstractFileBasedSpec) -> bool:
|
394
|
+
# fall back to preserve subdirectories if config is not present or incomplete
|
395
|
+
if (
|
396
|
+
hasattr(parsed_config, "delivery_options")
|
397
|
+
and parsed_config.delivery_options is not None
|
398
|
+
and hasattr(parsed_config.delivery_options, "preserve_subdirectories_directories")
|
399
|
+
):
|
400
|
+
return parsed_config.delivery_options.preserve_subdirectories_directories
|
401
|
+
return True
|
@@ -135,6 +135,17 @@ class AbstractFileBasedStreamReader(ABC):
|
|
135
135
|
return use_file_transfer
|
136
136
|
return False
|
137
137
|
|
138
|
+
def preserve_subdirectories_directories(self) -> bool:
|
139
|
+
# fall back to preserve subdirectories if config is not present or incomplete
|
140
|
+
if (
|
141
|
+
self.config
|
142
|
+
and hasattr(self.config, "delivery_options")
|
143
|
+
and self.config.delivery_options is not None
|
144
|
+
and hasattr(self.config.delivery_options, "preserve_subdirectories_directories")
|
145
|
+
):
|
146
|
+
return self.config.delivery_options.preserve_subdirectories_directories
|
147
|
+
return True
|
148
|
+
|
138
149
|
@abstractmethod
|
139
150
|
def get_file(
|
140
151
|
self, file: RemoteFile, local_directory: str, logger: logging.Logger
|
@@ -159,10 +170,13 @@ class AbstractFileBasedStreamReader(ABC):
|
|
159
170
|
"""
|
160
171
|
...
|
161
172
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
173
|
+
def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
|
174
|
+
preserve_subdirectories_directories = self.preserve_subdirectories_directories()
|
175
|
+
if preserve_subdirectories_directories:
|
176
|
+
# Remove left slashes from source path format to make relative path for writing locally
|
177
|
+
file_relative_path = file.uri.lstrip("/")
|
178
|
+
else:
|
179
|
+
file_relative_path = path.basename(file.uri)
|
166
180
|
local_file_path = path.join(local_directory, file_relative_path)
|
167
181
|
|
168
182
|
# Ensure the local directory exists
|
@@ -5,20 +5,24 @@
|
|
5
5
|
import asyncio
|
6
6
|
import itertools
|
7
7
|
import traceback
|
8
|
+
from collections import defaultdict
|
8
9
|
from copy import deepcopy
|
9
10
|
from functools import cache
|
10
|
-
from
|
11
|
+
from os import path
|
12
|
+
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union
|
11
13
|
|
12
14
|
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
|
13
15
|
from airbyte_cdk.models import Type as MessageType
|
14
16
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
|
15
17
|
from airbyte_cdk.sources.file_based.exceptions import (
|
18
|
+
DuplicatedFilesError,
|
16
19
|
FileBasedSourceError,
|
17
20
|
InvalidSchemaError,
|
18
21
|
MissingSchemaError,
|
19
22
|
RecordParseError,
|
20
23
|
SchemaInferenceError,
|
21
24
|
StopSyncPerValidationPolicy,
|
25
|
+
format_duplicate_files_error_message,
|
22
26
|
)
|
23
27
|
from airbyte_cdk.sources.file_based.file_types import FileTransfer
|
24
28
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
@@ -43,6 +47,8 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
43
47
|
"""
|
44
48
|
|
45
49
|
FILE_TRANSFER_KW = "use_file_transfer"
|
50
|
+
PRESERVE_SUBDIRECTORIES_KW = "preserve_subdirectories_directories"
|
51
|
+
FILES_KEY = "files"
|
46
52
|
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
|
47
53
|
ab_last_mod_col = "_ab_source_file_last_modified"
|
48
54
|
ab_file_name_col = "_ab_source_file_url"
|
@@ -50,10 +56,14 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
50
56
|
source_file_url = "source_file_url"
|
51
57
|
airbyte_columns = [ab_last_mod_col, ab_file_name_col]
|
52
58
|
use_file_transfer = False
|
59
|
+
preserve_subdirectories_directories = True
|
53
60
|
|
54
61
|
def __init__(self, **kwargs: Any):
|
55
62
|
if self.FILE_TRANSFER_KW in kwargs:
|
56
63
|
self.use_file_transfer = kwargs.pop(self.FILE_TRANSFER_KW, False)
|
64
|
+
self.preserve_subdirectories_directories = kwargs.pop(
|
65
|
+
self.PRESERVE_SUBDIRECTORIES_KW, True
|
66
|
+
)
|
57
67
|
super().__init__(**kwargs)
|
58
68
|
|
59
69
|
@property
|
@@ -98,15 +108,43 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
98
108
|
else:
|
99
109
|
return super()._filter_schema_invalid_properties(configured_catalog_json_schema)
|
100
110
|
|
111
|
+
def _duplicated_files_names(
|
112
|
+
self, slices: List[dict[str, List[RemoteFile]]]
|
113
|
+
) -> List[dict[str, List[str]]]:
|
114
|
+
seen_file_names = set()
|
115
|
+
duplicates_file_names = set()
|
116
|
+
file_paths = defaultdict(list)
|
117
|
+
for file_slice in slices:
|
118
|
+
for file_found in file_slice[self.FILES_KEY]:
|
119
|
+
file_name = path.basename(file_found.uri)
|
120
|
+
if file_name not in seen_file_names:
|
121
|
+
seen_file_names.add(file_name)
|
122
|
+
else:
|
123
|
+
duplicates_file_names.add(file_name)
|
124
|
+
file_paths[file_name].append(file_found.uri)
|
125
|
+
return [
|
126
|
+
{duplicated_file: file_paths[duplicated_file]}
|
127
|
+
for duplicated_file in duplicates_file_names
|
128
|
+
]
|
129
|
+
|
101
130
|
def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
|
102
131
|
# Sort files by last_modified, uri and return them grouped by last_modified
|
103
132
|
all_files = self.list_files()
|
104
133
|
files_to_read = self._cursor.get_files_to_sync(all_files, self.logger)
|
105
134
|
sorted_files_to_read = sorted(files_to_read, key=lambda f: (f.last_modified, f.uri))
|
106
135
|
slices = [
|
107
|
-
{
|
136
|
+
{self.FILES_KEY: list(group[1])}
|
108
137
|
for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
|
109
138
|
]
|
139
|
+
if slices and not self.preserve_subdirectories_directories:
|
140
|
+
duplicated_files_names = self._duplicated_files_names(slices)
|
141
|
+
if duplicated_files_names:
|
142
|
+
raise DuplicatedFilesError(
|
143
|
+
format_duplicate_files_error_message(
|
144
|
+
stream_name=self.name, duplicated_files_names=duplicated_files_names
|
145
|
+
),
|
146
|
+
stream=self.name,
|
147
|
+
)
|
110
148
|
return slices
|
111
149
|
|
112
150
|
def transform_record(
|
@@ -62,7 +62,7 @@ airbyte_cdk/sources/declarative/checks/check_stream.py,sha256=dAA-UhmMj0WLXCkRQr
|
|
62
62
|
airbyte_cdk/sources/declarative/checks/connection_checker.py,sha256=MBRJo6WJlZQHpIfOGaNOkkHUmgUl_4wDM6VPo41z5Ss,1383
|
63
63
|
airbyte_cdk/sources/declarative/concurrency_level/__init__.py,sha256=5XUqrmlstYlMM0j6crktlKQwALek0uiz2D3WdM46MyA,191
|
64
64
|
airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py,sha256=YIwCTCpOr_QSNW4ltQK0yUGWInI8PKNY216HOOegYLk,2101
|
65
|
-
airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=
|
65
|
+
airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=PxP4p2686wsf1gjsumGKnh2o2Jjnrqg8QLGijEIrp-A,23412
|
66
66
|
airbyte_cdk/sources/declarative/datetime/__init__.py,sha256=l9LG7Qm6e5r_qgqfVKnx3mXYtg1I9MmMjomVIPfU4XA,177
|
67
67
|
airbyte_cdk/sources/declarative/datetime/datetime_parser.py,sha256=SX9JjdesN1edN2WVUVMzU_ptqp2QB1OnsnjZ4mwcX7w,2579
|
68
68
|
airbyte_cdk/sources/declarative/datetime/min_max_datetime.py,sha256=0BHBtDNQZfvwM45-tY5pNlTcKAFSGGNxemoi0Jic-0E,5785
|
@@ -135,15 +135,15 @@ airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.p
|
|
135
135
|
airbyte_cdk/sources/declarative/requesters/http_job_repository.py,sha256=o0520AmHMb7SAoeokVNwoOzuZzIAT6ryx9uFYGSOrs0,8664
|
136
136
|
airbyte_cdk/sources/declarative/requesters/http_requester.py,sha256=RqYPkgJFAWfcZBTc-JBcGHPm4JL1ZQOhs9GKU4MP2eE,14723
|
137
137
|
airbyte_cdk/sources/declarative/requesters/paginators/__init__.py,sha256=uArbKs9JKNCt7t9tZoeWwjDpyI1HoPp29FNW0JzvaEM,644
|
138
|
-
airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py,sha256=
|
139
|
-
airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py,sha256
|
140
|
-
airbyte_cdk/sources/declarative/requesters/paginators/paginator.py,sha256=
|
138
|
+
airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py,sha256=LxTq1hieznRWlYlfODdZbMDUml-g6NyBkdwVI2mCNMM,10910
|
139
|
+
airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py,sha256=-P-QOlefFhEe99bsB2y3yTvA8c8kCCbfBaTS6qPvF6I,1927
|
140
|
+
airbyte_cdk/sources/declarative/requesters/paginators/paginator.py,sha256=ZgyvH7DOrASQ5K__J5SRAXH3REUW2n3yPHnFW9xq4NU,1972
|
141
141
|
airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py,sha256=2gly8fuZpDNwtu1Qg6oE2jBLGqQRdzSLJdnpk_iDV6I,767
|
142
|
-
airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py,sha256=
|
143
|
-
airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py,sha256=
|
144
|
-
airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py,sha256=
|
145
|
-
airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py,sha256=
|
146
|
-
airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py,sha256
|
142
|
+
airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py,sha256=vFzpNv8BdgXrYO5qhi2_Un4x4y-EAQWxinZtEPWz5KI,3654
|
143
|
+
airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py,sha256=pMPi6iQrhtrI9VRPj218QNM_OqD8lX8P3Tr9yloKoE8,3367
|
144
|
+
airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py,sha256=kQGpfr-dOwarxTIf2S4sHVulBzm8zSwQXBM7rOhkafA,2491
|
145
|
+
airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py,sha256=ABpO4t0UUziBZnyml8UT_NhlF6loekhQji57TpKnaiY,1290
|
146
|
+
airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py,sha256=-8NwokW-aKwv8DdeHh1ssODTobBYSOmIhH2-IjSjlNA,2213
|
147
147
|
airbyte_cdk/sources/declarative/requesters/request_option.py,sha256=_qmv8CLQQ3fERt6BuMZeRu6tZXscPoeARx1VJdWMQ_M,1055
|
148
148
|
airbyte_cdk/sources/declarative/requesters/request_options/__init__.py,sha256=WCwpKqM4wKqy-DHJaCHbKAlFqRVOqMi9K5qonxIfi_Y,809
|
149
149
|
airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py,sha256=FLkg0uzC9bc-zFnALWr0FLYpKsz8iK2xQsd4UOyeW08,3706
|
@@ -161,7 +161,7 @@ airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py,sha256=Aio
|
|
161
161
|
airbyte_cdk/sources/declarative/retrievers/__init__.py,sha256=ix9m1dkR69DcXCXUKC5RK_ZZM7ojTLBQ4IkWQTfmfCk,456
|
162
162
|
airbyte_cdk/sources/declarative/retrievers/async_retriever.py,sha256=_-d3MvHh-4r46i4wjQikD4ZygKA7TvuDu2i04qqULEg,3731
|
163
163
|
airbyte_cdk/sources/declarative/retrievers/retriever.py,sha256=XPLs593Xv8c5cKMc37XzUAYmzlXd1a7eSsspM-CMuWA,1696
|
164
|
-
airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=
|
164
|
+
airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=N4swGw5mfuTXJ2R7AKX18CHzizsr69pXwt5uSHLPi48,24172
|
165
165
|
airbyte_cdk/sources/declarative/schema/__init__.py,sha256=HztgVVaZdil5UfgUZcv_Hyy84r89_EKRwyO2hoewNVg,749
|
166
166
|
airbyte_cdk/sources/declarative/schema/default_schema_loader.py,sha256=KTACrIE23a83wsm3Rd9Eb4K6-20lrGqYxTHNp9yxsso,1820
|
167
167
|
airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py,sha256=H6A3NQ6kPPM-cUNPmdvDPc9xNzR1rQNrK95GbgCW334,8822
|
@@ -193,7 +193,7 @@ airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=ddKQfUmk
|
|
193
193
|
airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py,sha256=01Nd4b7ERAbp-OZo_8rrAzFXWPTMwr02SnWiN17nx8Q,2363
|
194
194
|
airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=j9T5TimfWFUz7nqsaj-83G3xWmDpsmeSbDnaUNmz0UM,5849
|
195
195
|
airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
196
|
-
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=
|
196
|
+
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=1V7NpVdcVA7QNF_mI0uH5X3WqBvrS6U-W1iHoTiEDtY,6926
|
197
197
|
airbyte_cdk/sources/file_based/config/avro_format.py,sha256=NxTF96ewzn6HuhgodsY7Rpb-ybr1ZEWW5d4Vid64g5A,716
|
198
198
|
airbyte_cdk/sources/file_based/config/csv_format.py,sha256=NWekkyT8dTwiVK0mwa_krQD4FJPHSDfILo8kPAg3-Vs,8006
|
199
199
|
airbyte_cdk/sources/file_based/config/excel_format.py,sha256=9qAmTsT6SoVzNfNv0oBVkVCmiyqQuVAbfRKajjoa7Js,378
|
@@ -204,9 +204,9 @@ airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=tIbB9Pn1HqU6
|
|
204
204
|
airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=gl3ey6mZbyfraB9P3pFhf9UJp2JeTZ1SUFAopy2iBvY,301
|
205
205
|
airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=dCfXX529Rd5rtopg4VeEgTPJjFtqjtjzPq6LCw18Wt0,605
|
206
206
|
airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=-xujTidtrq6HC00WKbjQh1CZdT5LMuzkp5BLjqDmfTY,1007
|
207
|
-
airbyte_cdk/sources/file_based/exceptions.py,sha256=
|
208
|
-
airbyte_cdk/sources/file_based/file_based_source.py,sha256=
|
209
|
-
airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=
|
207
|
+
airbyte_cdk/sources/file_based/exceptions.py,sha256=KfOgQgssBKgsv3h5po2IG1DhZcH664Zf_fx96mBlPSg,6761
|
208
|
+
airbyte_cdk/sources/file_based/file_based_source.py,sha256=Z-1EQS8_gczgBgBHJV75LCwHOtXgonEqxNpXmNmqVTM,17263
|
209
|
+
airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=Uwg_e9wHdJ2UaWxHcW2Ph6S4sHDhB0_MoyCEW7neVdA,6888
|
210
210
|
airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=blCLn0-2LC-ZdgcNyDEhqM2RiUvEjEBh-G4-t32ZtuM,1268
|
211
211
|
airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=XNx-JC-sgzH9u3nOJ2M59FxBXvtig8LN6BIkeDOavZA,10858
|
212
212
|
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=QlCXB-ry3np67Q_VerQEPoWDOTcPTB6Go4ydZxY9ae4,20445
|
@@ -232,7 +232,7 @@ airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_c
|
|
232
232
|
airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
|
233
233
|
airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=om-x3gZFPgWDpi15S9RxZmR36VHnk8sytgN6LlBQhAw,1934
|
234
234
|
airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py,sha256=VGV7xLyBribuBMVrXtO1xqkWJD86bl7yhXtjnwLMohM,7051
|
235
|
-
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=
|
235
|
+
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=HRjR0rQGc8cYK2PxpLgAvZQ--jvtV8QgS1QIxkemnko,18413
|
236
236
|
airbyte_cdk/sources/file_based/types.py,sha256=INxG7OPnkdUP69oYNKMAbwhvV1AGvLRHs1J6pIia2FI,218
|
237
237
|
airbyte_cdk/sources/http_config.py,sha256=OBZeuyFilm6NlDlBhFQvHhTWabEvZww6OHDIlZujIS0,730
|
238
238
|
airbyte_cdk/sources/http_logger.py,sha256=TyBmtRA6D9g0XDkKGvdM415b36RXDjgfkwRewDsH8-0,1576
|
@@ -340,8 +340,8 @@ airbyte_cdk/utils/slice_hasher.py,sha256=-pHexlNYoWYPnXNH-M7HEbjmeJe9Zk7SJijdQ7d
|
|
340
340
|
airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
|
341
341
|
airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
|
342
342
|
airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
|
343
|
-
airbyte_cdk-6.13.
|
344
|
-
airbyte_cdk-6.13.
|
345
|
-
airbyte_cdk-6.13.
|
346
|
-
airbyte_cdk-6.13.
|
347
|
-
airbyte_cdk-6.13.
|
343
|
+
airbyte_cdk-6.13.1.dev4100.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
344
|
+
airbyte_cdk-6.13.1.dev4100.dist-info/METADATA,sha256=P_PnklFtJqLdDdAxsiOLX7Ydkee8d7jz9m_v2XNCRAI,5996
|
345
|
+
airbyte_cdk-6.13.1.dev4100.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
|
346
|
+
airbyte_cdk-6.13.1.dev4100.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
|
347
|
+
airbyte_cdk-6.13.1.dev4100.dist-info/RECORD,,
|
File without changes
|
File without changes
|