airbyte-cdk 6.34.0.dev0__py3-none-any.whl → 6.34.0.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +16 -12
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +591 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +160 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +75 -0
- airbyte_cdk/entrypoint.py +6 -6
- airbyte_cdk/logger.py +1 -4
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +122 -38
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +5 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +10 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +2 -1
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/file_based_source.py +70 -37
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +107 -12
- airbyte_cdk/sources/file_based/stream/__init__.py +10 -1
- airbyte_cdk/sources/file_based/stream/identities_stream.py +47 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +85 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/test/mock_http/mocker.py +9 -1
- airbyte_cdk/test/mock_http/response.py +6 -3
- airbyte_cdk/utils/mapping_helpers.py +43 -2
- airbyte_cdk/utils/print_buffer.py +0 -4
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/RECORD +30 -21
- airbyte_cdk/connector_builder/message_grouper.py +0 -448
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/entry_points.txt +0 -0
airbyte_cdk/entrypoint.py
CHANGED
@@ -22,7 +22,7 @@ from requests import PreparedRequest, Response, Session
 
 from airbyte_cdk.connector import TConfig
 from airbyte_cdk.exception_handler import init_uncaught_exception_handler
-from airbyte_cdk.logger import PRINT_BUFFER, init_logger
+from airbyte_cdk.logger import init_logger
 from airbyte_cdk.models import (
     AirbyteConnectionStatus,
     AirbyteMessage,
@@ -337,11 +337,11 @@ def launch(source: Source, args: List[str]) -> None:
     parsed_args = source_entrypoint.parse_args(args)
     # temporarily removes the PrintBuffer because we're seeing weird print behavior for concurrent syncs
     # Refer to: https://github.com/airbytehq/oncall/issues/6235
-    with PRINT_BUFFER:
-        for message in source_entrypoint.run(parsed_args):
-            # simply printing is creating issues for concurrent CDK as Python uses different two instructions to print: one for the message and
-            # the other for the break line. Adding `\n` to the message ensure that both are printed at the same time
-            print(f"{message}\n", end="")
+    # with PrintBuffer():
+    for message in source_entrypoint.run(parsed_args):
+        # simply printing is creating issues for concurrent CDK as Python uses different two instructions to print: one for the message and
+        # the other for the break line. Adding `\n` to the message ensure that both are printed at the same time
+        print(f"{message}\n", end="", flush=True)
 
 
 def _init_internal_request_filter() -> None:
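The replacement of the buffered context manager with a direct, flushed print is easiest to see in isolation. A minimal sketch (not from the package) contrasting the two-write behavior of a bare print with the single-write form `launch()` now uses:

```python
import sys
import threading

def write_two_ops(msg: str) -> None:
    # print(msg) issues two writes (the message, then "\n"); under concurrency,
    # another thread's output can land between them.
    sys.stdout.write(msg)
    sys.stdout.write("\n")

def write_one_op(msg: str) -> None:
    # Embedding the newline and flushing in one call mirrors
    # print(f"{message}\n", end="", flush=True) from launch().
    sys.stdout.write(msg + "\n")
    sys.stdout.flush()

threads = [threading.Thread(target=write_one_op, args=(f"message {i}",)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```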
airbyte_cdk/logger.py
CHANGED
@@ -16,11 +16,8 @@ from airbyte_cdk.models import (
     Level,
     Type,
 )
-from airbyte_cdk.utils import PrintBuffer
 from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
 
-PRINT_BUFFER = PrintBuffer(flush_interval=0.1)
-
 LOGGING_CONFIG = {
     "version": 1,
     "disable_existing_loggers": False,
@@ -30,7 +27,7 @@ LOGGING_CONFIG = {
     "handlers": {
         "console": {
             "class": "logging.StreamHandler",
-            "stream": PRINT_BUFFER,
+            "stream": "ext://sys.stdout",
             "formatter": "airbyte",
         },
     },
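With the PRINT_BUFFER singleton gone, the console handler is wired to stdout through dictConfig's `ext://` protocol, which resolves the string to the external `sys.stdout` object at configuration time. A self-contained illustration (simplified formatter, not the package's):

```python
import logging
import logging.config

config = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {"plain": {"format": "%(levelname)s %(message)s"}},
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "stream": "ext://sys.stdout",  # resolved to sys.stdout by dictConfig
            "formatter": "plain",
        }
    },
    "root": {"handlers": ["console"], "level": "INFO"},
}

logging.config.dictConfig(config)
logging.getLogger("airbyte").info("handler writes straight to stdout")
```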
airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py
CHANGED
@@ -5,6 +5,7 @@
 import copy
 import logging
 import threading
+import time
 from collections import OrderedDict
 from copy import deepcopy
 from datetime import timedelta
@@ -58,7 +59,8 @@ class ConcurrentPerPartitionCursor(Cursor):
     CurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}.
     """
 
-    DEFAULT_MAX_PARTITIONS_NUMBER = 10_000
+    DEFAULT_MAX_PARTITIONS_NUMBER = 25_000
+    SWITCH_TO_GLOBAL_LIMIT = 10_000
     _NO_STATE: Mapping[str, Any] = {}
     _NO_CURSOR_STATE: Mapping[str, Any] = {}
     _GLOBAL_STATE_KEY = "state"
@@ -93,15 +95,21 @@ class ConcurrentPerPartitionCursor(Cursor):
         # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
         self._cursor_per_partition: OrderedDict[str, ConcurrentCursor] = OrderedDict()
         self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()
+
+        # Parent-state tracking: store each partition's parent state in creation order
+        self._partition_parent_state_map: OrderedDict[str, Mapping[str, Any]] = OrderedDict()
+
         self._finished_partitions: set[str] = set()
         self._lock = threading.Lock()
         self._timer = Timer()
         self._new_global_cursor: Optional[StreamState] = None
         self._lookback_window: int = 0
         self._parent_state: Optional[StreamState] = None
-        self._over_limit: int = 0
+        self._number_of_partitions: int = 0
         self._use_global_cursor: bool = False
         self._partition_serializer = PerPartitionKeySerializer()
+        # Track the last time a state message was emitted
+        self._last_emission_time: float = 0.0
 
         self._set_initial_state(stream_state)
 
@@ -141,22 +149,55 @@ class ConcurrentPerPartitionCursor(Cursor):
             raise ValueError("stream_slice cannot be None")
 
         partition_key = self._to_partition_key(stream_slice.partition)
-        self._cursor_per_partition[partition_key].close_partition(partition=partition)
         with self._lock:
             self._semaphore_per_partition[partition_key].acquire()
-            cursor = self._cursor_per_partition[partition_key]
-            if (
-                partition_key in self._finished_partitions
-                and self._semaphore_per_partition[partition_key]._value == 0
-            ):
+            if not self._use_global_cursor:
+                self._cursor_per_partition[partition_key].close_partition(partition=partition)
+                cursor = self._cursor_per_partition[partition_key]
                 if (
-                    self._new_global_cursor is None
-                    or self._new_global_cursor[self.cursor_field.cursor_field_key]
-                    < cursor.state[self.cursor_field.cursor_field_key]
+                    partition_key in self._finished_partitions
+                    and self._semaphore_per_partition[partition_key]._value == 0
                 ):
-                    self._new_global_cursor = {
-                        self.cursor_field.cursor_field_key: copy.deepcopy(cursor.state[self.cursor_field.cursor_field_key])
-                    }
+                    self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key])
+
+        self._check_and_update_parent_state()
+
+        self._emit_state_message()
+
+    def _check_and_update_parent_state(self) -> None:
+        """
+        Pop the leftmost partition state from _partition_parent_state_map only if
+        *all partitions* up to (and including) that partition key in _semaphore_per_partition
+        are fully finished (i.e. in _finished_partitions and semaphore._value == 0).
+        """
+        last_closed_state = None
+
+        while self._partition_parent_state_map:
+            # Look at the earliest partition key in creation order
+            earliest_key = next(iter(self._partition_parent_state_map))
+
+            # Verify ALL partitions from the left up to earliest_key are finished
+            all_left_finished = True
+            for p_key, sem in self._semaphore_per_partition.items():
+                # If any earlier partition is still not finished, we must stop
+                if p_key not in self._finished_partitions or sem._value != 0:
+                    all_left_finished = False
+                    break
+                # Once we've reached earliest_key in the semaphore order, we can stop checking
+                if p_key == earliest_key:
+                    break
+
+            # If the partitions up to earliest_key are not all finished, break the while-loop
+            if not all_left_finished:
+                break
+
+            # Otherwise, pop the leftmost entry from parent-state map
+            _, closed_parent_state = self._partition_parent_state_map.popitem(last=False)
+            last_closed_state = closed_parent_state
+
+        # Update _parent_state if we actually popped at least one partition
+        if last_closed_state is not None:
+            self._parent_state = last_closed_state
 
     def ensure_at_least_one_state_emitted(self) -> None:
         """
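The ordering invariant behind `_check_and_update_parent_state` can be exercised on its own. A simplified sketch with hypothetical data, reducing the semaphore bookkeeping to a plain `finished` set, of the pop-from-the-left rule:

```python
from collections import OrderedDict
from typing import Optional

parent_state_map: "OrderedDict[str, dict]" = OrderedDict(
    [("p1", {"parent_cursor": "2024-01-01"}), ("p2", {"parent_cursor": "2024-02-01"})]
)
finished = {"p1"}  # p2 is still in flight

def pop_closed_parent_state() -> Optional[dict]:
    last_closed = None
    while parent_state_map:
        earliest = next(iter(parent_state_map))  # earliest key in creation order
        if earliest not in finished:
            break  # an unfinished partition blocks everything after it
        _, last_closed = parent_state_map.popitem(last=False)  # pop leftmost entry
    return last_closed

print(pop_closed_parent_state())  # {'parent_cursor': '2024-01-01'}; p2 blocks further pops
```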
@@ -169,9 +210,23 @@ class ConcurrentPerPartitionCursor(Cursor):
         self._global_cursor = self._new_global_cursor
         self._lookback_window = self._timer.finish()
         self._parent_state = self._partition_router.get_stream_state()
-        self._emit_state_message()
+        self._emit_state_message(throttle=False)
 
-    def _emit_state_message(self) -> None:
+    def _throttle_state_message(self) -> Optional[float]:
+        """
+        Throttles the state message emission to once every 60 seconds.
+        """
+        current_time = time.time()
+        if current_time - self._last_emission_time <= 60:
+            return None
+        return current_time
+
+    def _emit_state_message(self, throttle: bool = True) -> None:
+        if throttle:
+            current_time = self._throttle_state_message()
+            if current_time is None:
+                return
+            self._last_emission_time = current_time
         self._connector_state_manager.update_state_for_stream(
             self._stream_name,
             self._stream_namespace,
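The throttle itself is a plain time-based gate. A self-contained sketch of the same pattern; the real cursor routes this through `_throttle_state_message()` and bypasses it with `throttle=False` for the final emission in `ensure_at_least_one_state_emitted()`:

```python
import time

class ThrottledEmitter:
    INTERVAL_SECONDS = 60

    def __init__(self) -> None:
        self._last_emission_time = 0.0

    def emit(self, state: dict, throttle: bool = True) -> bool:
        now = time.time()
        if throttle and now - self._last_emission_time <= self.INTERVAL_SECONDS:
            return False  # dropped: less than a minute since the last state message
        self._last_emission_time = now
        print(f"STATE: {state}")
        return True

emitter = ThrottledEmitter()
emitter.emit({"cursor": "2024-01-01"})                  # emitted
emitter.emit({"cursor": "2024-01-02"})                  # throttled
emitter.emit({"cursor": "2024-01-03"}, throttle=False)  # final state always goes out
```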
@@ -188,13 +243,19 @@ class ConcurrentPerPartitionCursor(Cursor):
 
         slices = self._partition_router.stream_slices()
         self._timer.start()
-        for partition in slices:
-            yield from self._generate_slices_from_partition(partition)
+        for partition, last, parent_state in iterate_with_last_flag_and_state(
+            slices, self._partition_router.get_stream_state
+        ):
+            yield from self._generate_slices_from_partition(partition, parent_state)
 
-    def _generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
+    def _generate_slices_from_partition(
+        self, partition: StreamSlice, parent_state: Mapping[str, Any]
+    ) -> Iterable[StreamSlice]:
         # Ensure the maximum number of partitions is not exceeded
         self._ensure_partition_limit()
 
+        partition_key = self._to_partition_key(partition.partition)
+
         cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
         if not cursor:
             cursor = self._create_cursor(
@@ -202,18 +263,27 @@ class ConcurrentPerPartitionCursor(Cursor):
                 self._lookback_window if self._global_cursor else 0,
             )
             with self._lock:
-                self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
-                self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
-                    threading.Semaphore(0)
-                )
+                self._number_of_partitions += 1
+                self._cursor_per_partition[partition_key] = cursor
+                self._semaphore_per_partition[partition_key] = threading.Semaphore(0)
+
+        with self._lock:
+            if (
+                len(self._partition_parent_state_map) == 0
+                or self._partition_parent_state_map[
+                    next(reversed(self._partition_parent_state_map))
+                ]
+                != parent_state
+            ):
+                self._partition_parent_state_map[partition_key] = deepcopy(parent_state)
 
         for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
             cursor.stream_slices(),
             lambda: None,
         ):
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)].release()
+            self._semaphore_per_partition[partition_key].release()
             if is_last_slice:
-                self._finished_partitions.add(self._to_partition_key(partition.partition))
+                self._finished_partitions.add(partition_key)
             yield StreamSlice(
                 partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
             )
@@ -232,9 +302,15 @@ class ConcurrentPerPartitionCursor(Cursor):
         - Logs a warning each time a partition is removed, indicating whether it was finished
           or removed due to being the oldest.
         """
+        if not self._use_global_cursor and self.limit_reached():
+            logger.info(
+                f"Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of {self.SWITCH_TO_GLOBAL_LIMIT}. "
+                f"Switching to global cursor for {self._stream_name}."
+            )
+            self._use_global_cursor = True
+
         with self._lock:
             while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
-                self._over_limit += 1
                 # Try removing finished partitions first
                 for partition_key in list(self._cursor_per_partition.keys()):
                     if (
@@ -245,7 +321,7 @@ class ConcurrentPerPartitionCursor(Cursor):
                         partition_key
                     )  # Remove the oldest partition
                     logger.warning(
-                        f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._over_limit}."
+                        f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
                     )
                     break
             else:
@@ -254,7 +330,7 @@ class ConcurrentPerPartitionCursor(Cursor):
                     1
                 ]  # Remove the oldest partition
                 logger.warning(
-                    f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
+                    f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
                 )
 
     def _set_initial_state(self, stream_state: StreamState) -> None:
@@ -314,12 +390,10 @@ class ConcurrentPerPartitionCursor(Cursor):
             self._lookback_window = int(stream_state.get("lookback_window", 0))
 
             for state in stream_state.get(self._PERPARTITION_STATE_KEY, []):
+                self._number_of_partitions += 1
                 self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
                     self._create_cursor(state["cursor"])
                 )
-                self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
-                    threading.Semaphore(0)
-                )
 
             # set default state for missing partitions if it is per partition with fallback to global
             if self._GLOBAL_STATE_KEY in stream_state:
@@ -354,16 +428,26 @@ class ConcurrentPerPartitionCursor(Cursor):
             self._new_global_cursor = deepcopy(fixed_global_state)
 
     def observe(self, record: Record) -> None:
-        if not self._use_global_cursor and self.limit_reached():
-            self._use_global_cursor = True
-
         if not record.associated_slice:
             raise ValueError(
                 "Invalid state as stream slices that are emitted should refer to an existing cursor"
             )
-        self._cursor_per_partition[
-            self._to_partition_key(record.associated_slice.partition)
-        ].observe(record)
+
+        record_cursor = self._connector_state_converter.output_format(
+            self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
+        )
+        self._update_global_cursor(record_cursor)
+        if not self._use_global_cursor:
+            self._cursor_per_partition[
+                self._to_partition_key(record.associated_slice.partition)
+            ].observe(record)
+
+    def _update_global_cursor(self, value: Any) -> None:
+        if (
+            self._new_global_cursor is None
+            or self._new_global_cursor[self.cursor_field.cursor_field_key] < value
+        ):
+            self._new_global_cursor = {self.cursor_field.cursor_field_key: copy.deepcopy(value)}
 
     def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
         return self._partition_serializer.to_partition_key(partition)
@@ -397,4 +481,4 @@ class ConcurrentPerPartitionCursor(Cursor):
         return cursor
 
     def limit_reached(self) -> bool:
-        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
+        return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT
airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py
CHANGED
@@ -21,6 +21,7 @@ from airbyte_cdk.sources.declarative.requesters.request_option import (
 )
 from airbyte_cdk.sources.message import MessageRepository
 from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState
+from airbyte_cdk.utils.mapping_helpers import _validate_component_request_option_paths
 
 
 @dataclass
@@ -122,6 +123,10 @@ class DatetimeBasedCursor(DeclarativeCursor):
         if not self.cursor_datetime_formats:
             self.cursor_datetime_formats = [self.datetime_format]
 
+        _validate_component_request_option_paths(
+            self.config, self.start_time_option, self.end_time_option
+        )
+
     def get_stream_state(self) -> StreamState:
         return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {}  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__
 
airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py
CHANGED
@@ -23,6 +23,9 @@ from airbyte_cdk.sources.declarative.requesters.request_option import (
 )
 from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
 from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState
+from airbyte_cdk.utils.mapping_helpers import (
+    _validate_component_request_option_paths,
+)
 
 
 @dataclass
@@ -113,6 +116,13 @@ class DefaultPaginator(Paginator):
         if isinstance(self.url_base, str):
             self.url_base = InterpolatedString(string=self.url_base, parameters=parameters)
 
+        if self.page_token_option and not isinstance(self.page_token_option, RequestPath):
+            _validate_component_request_option_paths(
+                self.config,
+                self.page_size_option,
+                self.page_token_option,
+            )
+
     def get_initial_token(self) -> Optional[Any]:
         """
         Return the page token that should be used for the first request of a stream
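Both components above now call `_validate_component_request_option_paths` at init time. The failure mode it guards against is two request options injecting into the same request field; a hedged, standalone illustration of that collision (names are illustrative, not the CDK API):

```python
def apply_request_options(options: list[tuple[str, str]]) -> dict[str, str]:
    params: dict[str, str] = {}
    for field_name, value in options:
        if field_name in params:
            # Without validation, the second option would silently overwrite the first.
            raise ValueError(f"Duplicate request option path: {field_name!r}")
        params[field_name] = value
    return params

print(apply_request_options([("page_size", "50"), ("cursor", "abc")]))  # distinct paths: fine
# apply_request_options([("cursor", "50"), ("cursor", "abc")])          # raises ValueError
```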
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
CHANGED
@@ -11,6 +11,7 @@ from pydantic.v1 import AnyUrl, BaseModel, Field
 
 from airbyte_cdk import OneOfOptionConfig
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
+from airbyte_cdk.sources.specs.transfer_modes import DeliverPermissions
 from airbyte_cdk.sources.utils import schema_helpers
 
 
@@ -65,7 +66,7 @@ class AbstractFileBasedSpec(BaseModel):
         order=10,
     )
 
-    delivery_method: Union[DeliverRecords, DeliverRawFiles] = Field(
+    delivery_method: Union[DeliverRecords, DeliverRawFiles, DeliverPermissions] = Field(
         title="Delivery Method",
         discriminator="delivery_type",
         type="object",
airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py
ADDED
@@ -0,0 +1,81 @@
+#
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+#
+
+from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import (
+    AbstractFileBasedSpec,
+    DeliverRawFiles,
+)
+from airbyte_cdk.sources.specs.transfer_modes import DeliverPermissions
+
+DELIVERY_TYPE_KEY = "delivery_type"
+DELIVERY_TYPE_PERMISSION_TRANSFER_MODE_VALUE = "use_permissions_transfer"
+DELIVERY_TYPE_FILES_TRANSFER_MODE_VALUE = "use_file_transfer"
+PRESERVE_DIRECTORY_STRUCTURE_KEY = "preserve_directory_structure"
+INCLUDE_IDENTITIES_STREAM_KEY = "include_identities_stream"
+
+
+def use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
+    """Returns `True` if the configuration uses file transfer mode."""
+    return (
+        hasattr(parsed_config.delivery_method, DELIVERY_TYPE_KEY)
+        and parsed_config.delivery_method.delivery_type == DELIVERY_TYPE_FILES_TRANSFER_MODE_VALUE
+    )
+
+
+def preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
+    """
+    Determines whether to preserve directory structure during file transfer.
+
+    When enabled, files maintain their subdirectory paths in the destination.
+    When disabled, files are flattened to the root of the destination.
+
+    Args:
+        parsed_config: The parsed configuration containing delivery method settings
+
+    Returns:
+        True if directory structure should be preserved (default), False otherwise
+    """
+    if (
+        use_file_transfer(parsed_config)
+        and hasattr(parsed_config.delivery_method, PRESERVE_DIRECTORY_STRUCTURE_KEY)
+        and isinstance(parsed_config.delivery_method, DeliverRawFiles)
+    ):
+        return parsed_config.delivery_method.preserve_directory_structure
+    return True
+
+
+def use_permissions_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
+    """
+    Determines whether to use permissions transfer to sync ACLs and Identities
+
+    Args:
+        parsed_config: The parsed configuration containing delivery method settings
+
+    Returns:
+        True if permissions transfer should be enabled, False otherwise
+    """
+    return (
+        hasattr(parsed_config.delivery_method, DELIVERY_TYPE_KEY)
+        and parsed_config.delivery_method.delivery_type
+        == DELIVERY_TYPE_PERMISSION_TRANSFER_MODE_VALUE
+    )
+
+
+def include_identities_stream(parsed_config: AbstractFileBasedSpec) -> bool:
+    """
+    There are scenarios where user may not have access to identities but still is valuable to get ACLs
+
+    Args:
+        parsed_config: The parsed configuration containing delivery method settings
+
+    Returns:
+        True if we should include Identities stream.
+    """
+    if (
+        use_permissions_transfer(parsed_config)
+        and hasattr(parsed_config.delivery_method, INCLUDE_IDENTITIES_STREAM_KEY)
+        and isinstance(parsed_config.delivery_method, DeliverPermissions)
+    ):
+        return parsed_config.delivery_method.include_identities_stream
+    return False
airbyte_cdk/sources/file_based/file_based_source.py
CHANGED
@@ -33,6 +33,12 @@ from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
     FileBasedStreamConfig,
     ValidationPolicy,
 )
+from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
+    include_identities_stream,
+    preserve_directory_structure,
+    use_file_transfer,
+    use_permissions_transfer,
+)
 from airbyte_cdk.sources.file_based.discovery_policy import (
     AbstractDiscoveryPolicy,
     DefaultDiscoveryPolicy,
@@ -49,7 +55,12 @@ from airbyte_cdk.sources.file_based.schema_validation_policies import (
     DEFAULT_SCHEMA_VALIDATION_POLICIES,
     AbstractSchemaValidationPolicy,
 )
-from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
+from airbyte_cdk.sources.file_based.stream import (
+    AbstractFileBasedStream,
+    DefaultFileBasedStream,
+    FileIdentitiesStream,
+    PermissionsFileBasedStream,
+)
 from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
 from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
     AbstractConcurrentFileBasedCursor,
@@ -66,6 +77,7 @@ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 DEFAULT_CONCURRENCY = 100
 MAX_CONCURRENCY = 100
 INITIAL_N_PARTITIONS = MAX_CONCURRENCY // 2
+IDENTITIES_STREAM = "identities"
 
 
 class FileBasedSource(ConcurrentSourceAdapter, ABC):
@@ -157,13 +169,20 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
         errors = []
         tracebacks = []
         for stream in streams:
+            if isinstance(stream, FileIdentitiesStream):
+                identity = next(iter(stream.load_identity_groups()))
+                if not identity:
+                    errors.append(
+                        "Unable to get identities for current configuration, please check your credentials"
+                    )
+                continue
             if not isinstance(stream, AbstractFileBasedStream):
                 raise ValueError(f"Stream {stream} is not a file-based stream.")
             try:
                 parsed_config = self._get_parsed_config(config)
                 availability_method = (
                     stream.availability_strategy.check_availability
-                    if self._use_file_transfer(parsed_config)
+                    if use_file_transfer(parsed_config) or use_permissions_transfer(parsed_config)
                     else stream.availability_strategy.check_availability_and_parsability
                 )
                 (
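One caveat worth flagging in the identities check above: `next(iter(...))` raises StopIteration when `load_identity_groups()` yields nothing, rather than producing a falsy value for the `if not identity` branch. A defensive variant (hypothetical helper, not package code) passes a sentinel default:

```python
from typing import Any, Iterable, Optional

def first_identity(identity_groups: Iterable[Any]) -> Optional[Any]:
    # Returns None instead of raising StopIteration when the iterable is empty.
    return next(iter(identity_groups), None)

print(first_identity([]))           # None
print(first_identity([{"id": 1}]))  # {'id': 1}
```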
@@ -239,7 +258,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     message_repository=self.message_repository,
                 )
                 stream = FileBasedStreamFacade.create_from_stream(
-                    stream=self._make_default_stream(
+                    stream=self._make_file_based_stream(
                         stream_config=stream_config,
                         cursor=cursor,
                         parsed_config=parsed_config,
@@ -270,7 +289,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     CursorField(DefaultFileBasedStream.ab_last_mod_col),
                 )
                 stream = FileBasedStreamFacade.create_from_stream(
-                    stream=self._make_default_stream(
+                    stream=self._make_file_based_stream(
                         stream_config=stream_config,
                         cursor=cursor,
                         parsed_config=parsed_config,
@@ -282,13 +301,17 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                 )
             else:
                 cursor = self.cursor_cls(stream_config)
-                stream = self._make_default_stream(
+                stream = self._make_file_based_stream(
                     stream_config=stream_config,
                     cursor=cursor,
                     parsed_config=parsed_config,
                 )
 
                 streams.append(stream)
+
+            if include_identities_stream(parsed_config):
+                identities_stream = self._make_identities_stream()
+                streams.append(identities_stream)
             return streams
 
         except ValidationError as exc:
@@ -310,8 +333,48 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             validation_policy=self._validate_and_get_validation_policy(stream_config),
             errors_collector=self.errors_collector,
             cursor=cursor,
-            use_file_transfer=self._use_file_transfer(parsed_config),
-            preserve_directory_structure=self._preserve_directory_structure(parsed_config),
+            use_file_transfer=use_file_transfer(parsed_config),
+            preserve_directory_structure=preserve_directory_structure(parsed_config),
+        )
+
+    def _make_permissions_stream(
+        self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor]
+    ) -> AbstractFileBasedStream:
+        return PermissionsFileBasedStream(
+            config=stream_config,
+            catalog_schema=self.stream_schemas.get(stream_config.name),
+            stream_reader=self.stream_reader,
+            availability_strategy=self.availability_strategy,
+            discovery_policy=self.discovery_policy,
+            parsers=self.parsers,
+            validation_policy=self._validate_and_get_validation_policy(stream_config),
+            errors_collector=self.errors_collector,
+            cursor=cursor,
+        )
+
+    def _make_file_based_stream(
+        self,
+        stream_config: FileBasedStreamConfig,
+        cursor: Optional[AbstractFileBasedCursor],
+        parsed_config: AbstractFileBasedSpec,
+    ) -> AbstractFileBasedStream:
+        """
+        Creates different streams depending on the type of the transfer mode selected
+        """
+        if use_permissions_transfer(parsed_config):
+            return self._make_permissions_stream(stream_config, cursor)
+        # we should have a stream for File transfer mode to decouple from DefaultFileBasedStream
+        else:
+            return self._make_default_stream(stream_config, cursor, parsed_config)
+
+    def _make_identities_stream(
+        self,
+    ) -> Stream:
+        return FileIdentitiesStream(
+            catalog_schema=self.stream_schemas.get(FileIdentitiesStream.IDENTITIES_STREAM_NAME),
+            stream_reader=self.stream_reader,
+            discovery_policy=self.discovery_policy,
+            errors_collector=self.errors_collector,
         )
 
     def _get_stream_from_catalog(
@@ -378,33 +441,3 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             "`input_schema` and `schemaless` options cannot both be set",
             model=FileBasedStreamConfig,
         )
-
-    @staticmethod
-    def _use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
-        use_file_transfer = (
-            hasattr(parsed_config.delivery_method, "delivery_type")
-            and parsed_config.delivery_method.delivery_type == "use_file_transfer"
-        )
-        return use_file_transfer
-
-    @staticmethod
-    def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
-        """
-        Determines whether to preserve directory structure during file transfer.
-
-        When enabled, files maintain their subdirectory paths in the destination.
-        When disabled, files are flattened to the root of the destination.
-
-        Args:
-            parsed_config: The parsed configuration containing delivery method settings
-
-        Returns:
-            True if directory structure should be preserved (default), False otherwise
-        """
-        if (
-            FileBasedSource._use_file_transfer(parsed_config)
-            and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
-            and parsed_config.delivery_method.preserve_directory_structure is not None
-        ):
-            return parsed_config.delivery_method.preserve_directory_structure
-        return True
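Net effect of the file_based_source.py changes: the per-source static helpers are replaced by the module-level functions from validate_config_transfer_modes, and stream construction dispatches on the configured delivery type. A condensed, self-contained sketch of that dispatch (stub return values stand in for the stream classes):

```python
from types import SimpleNamespace

def use_permissions_transfer(parsed_config) -> bool:
    # Mirrors the module-level helper: branch on the configured delivery_type.
    return getattr(parsed_config.delivery_method, "delivery_type", None) == "use_permissions_transfer"

def make_stream(parsed_config) -> str:
    if use_permissions_transfer(parsed_config):
        return "PermissionsFileBasedStream"
    return "DefaultFileBasedStream"  # record and raw-file transfer still land here

cfg = SimpleNamespace(delivery_method=SimpleNamespace(delivery_type="use_permissions_transfer"))
print(make_stream(cfg))  # PermissionsFileBasedStream
```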