airbyte-cdk 6.34.0.dev0__py3-none-any.whl → 6.34.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +16 -12
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +591 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +160 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +75 -0
- airbyte_cdk/entrypoint.py +6 -6
- airbyte_cdk/logger.py +1 -4
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +56 -25
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +5 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +10 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +2 -1
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/file_based_source.py +70 -37
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +107 -12
- airbyte_cdk/sources/file_based/stream/__init__.py +10 -1
- airbyte_cdk/sources/file_based/stream/identities_stream.py +47 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +85 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/utils/mapping_helpers.py +43 -2
- airbyte_cdk/utils/print_buffer.py +0 -4
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev1.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev1.dist-info}/RECORD +28 -19
- airbyte_cdk/connector_builder/message_grouper.py +0 -448
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev1.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev1.dist-info}/entry_points.txt +0 -0
airbyte_cdk/entrypoint.py
CHANGED
@@ -22,7 +22,7 @@ from requests import PreparedRequest, Response, Session
|
|
22
22
|
|
23
23
|
from airbyte_cdk.connector import TConfig
|
24
24
|
from airbyte_cdk.exception_handler import init_uncaught_exception_handler
|
25
|
-
from airbyte_cdk.logger import
|
25
|
+
from airbyte_cdk.logger import init_logger
|
26
26
|
from airbyte_cdk.models import (
|
27
27
|
AirbyteConnectionStatus,
|
28
28
|
AirbyteMessage,
|
@@ -337,11 +337,11 @@ def launch(source: Source, args: List[str]) -> None:
|
|
337
337
|
parsed_args = source_entrypoint.parse_args(args)
|
338
338
|
# temporarily removes the PrintBuffer because we're seeing weird print behavior for concurrent syncs
|
339
339
|
# Refer to: https://github.com/airbytehq/oncall/issues/6235
|
340
|
-
with
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
340
|
+
# with PrintBuffer():
|
341
|
+
for message in source_entrypoint.run(parsed_args):
|
342
|
+
# simply printing is creating issues for concurrent CDK as Python uses two different instructions to print: one for the message and
|
343
|
+
# the other for the line break. Adding `\n` to the message ensures that both are printed at the same time
|
344
|
+
print(f"{message}\n", end="", flush=True)
|
345
345
|
|
346
346
|
|
347
347
|
def _init_internal_request_filter() -> None:
|
airbyte_cdk/logger.py
CHANGED
@@ -16,11 +16,8 @@ from airbyte_cdk.models import (
|
|
16
16
|
Level,
|
17
17
|
Type,
|
18
18
|
)
|
19
|
-
from airbyte_cdk.utils import PrintBuffer
|
20
19
|
from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
|
21
20
|
|
22
|
-
PRINT_BUFFER = PrintBuffer(flush_interval=0.1)
|
23
|
-
|
24
21
|
LOGGING_CONFIG = {
|
25
22
|
"version": 1,
|
26
23
|
"disable_existing_loggers": False,
|
@@ -30,7 +27,7 @@ LOGGING_CONFIG = {
|
|
30
27
|
"handlers": {
|
31
28
|
"console": {
|
32
29
|
"class": "logging.StreamHandler",
|
33
|
-
"stream":
|
30
|
+
"stream": "ext://sys.stdout",
|
34
31
|
"formatter": "airbyte",
|
35
32
|
},
|
36
33
|
},
|
@@ -5,6 +5,7 @@
|
|
5
5
|
import copy
|
6
6
|
import logging
|
7
7
|
import threading
|
8
|
+
import time
|
8
9
|
from collections import OrderedDict
|
9
10
|
from copy import deepcopy
|
10
11
|
from datetime import timedelta
|
@@ -58,7 +59,8 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
58
59
|
ConcurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}.
|
59
60
|
"""
|
60
61
|
|
61
|
-
DEFAULT_MAX_PARTITIONS_NUMBER =
|
62
|
+
DEFAULT_MAX_PARTITIONS_NUMBER = 25_000
|
63
|
+
SWITCH_TO_GLOBAL_LIMIT = 10_000
|
62
64
|
_NO_STATE: Mapping[str, Any] = {}
|
63
65
|
_NO_CURSOR_STATE: Mapping[str, Any] = {}
|
64
66
|
_GLOBAL_STATE_KEY = "state"
|
@@ -99,9 +101,11 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
99
101
|
self._new_global_cursor: Optional[StreamState] = None
|
100
102
|
self._lookback_window: int = 0
|
101
103
|
self._parent_state: Optional[StreamState] = None
|
102
|
-
self.
|
104
|
+
self._number_of_partitions: int = 0
|
103
105
|
self._use_global_cursor: bool = False
|
104
106
|
self._partition_serializer = PerPartitionKeySerializer()
|
107
|
+
# Track the last time a state message was emitted
|
108
|
+
self._last_emission_time: float = 0.0
|
105
109
|
|
106
110
|
self._set_initial_state(stream_state)
|
107
111
|
|
@@ -141,21 +145,16 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
141
145
|
raise ValueError("stream_slice cannot be None")
|
142
146
|
|
143
147
|
partition_key = self._to_partition_key(stream_slice.partition)
|
144
|
-
self._cursor_per_partition[partition_key].close_partition(partition=partition)
|
145
148
|
with self._lock:
|
146
149
|
self._semaphore_per_partition[partition_key].acquire()
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
and self._semaphore_per_partition[partition_key]._value == 0
|
151
|
-
):
|
150
|
+
if not self._use_global_cursor:
|
151
|
+
self._cursor_per_partition[partition_key].close_partition(partition=partition)
|
152
|
+
cursor = self._cursor_per_partition[partition_key]
|
152
153
|
if (
|
153
|
-
self.
|
154
|
-
|
155
|
-
< cursor.state[self.cursor_field.cursor_field_key]
|
154
|
+
partition_key in self._finished_partitions
|
155
|
+
and self._semaphore_per_partition[partition_key]._value == 0
|
156
156
|
):
|
157
|
-
self.
|
158
|
-
if not self._use_global_cursor:
|
157
|
+
self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key])
|
159
158
|
self._emit_state_message()
|
160
159
|
|
161
160
|
def ensure_at_least_one_state_emitted(self) -> None:
|
@@ -169,9 +168,23 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
169
168
|
self._global_cursor = self._new_global_cursor
|
170
169
|
self._lookback_window = self._timer.finish()
|
171
170
|
self._parent_state = self._partition_router.get_stream_state()
|
172
|
-
self._emit_state_message()
|
171
|
+
self._emit_state_message(throttle=False)
|
173
172
|
|
174
|
-
def
|
173
|
+
def _throttle_state_message(self) -> Optional[float]:
|
174
|
+
"""
|
175
|
+
Throttles the state message emission to once every 60 seconds.
|
176
|
+
"""
|
177
|
+
current_time = time.time()
|
178
|
+
if current_time - self._last_emission_time <= 60:
|
179
|
+
return None
|
180
|
+
return current_time
|
181
|
+
|
182
|
+
def _emit_state_message(self, throttle: bool = True) -> None:
|
183
|
+
if throttle:
|
184
|
+
current_time = self._throttle_state_message()
|
185
|
+
if current_time is None:
|
186
|
+
return
|
187
|
+
self._last_emission_time = current_time
|
175
188
|
self._connector_state_manager.update_state_for_stream(
|
176
189
|
self._stream_name,
|
177
190
|
self._stream_namespace,
|
@@ -202,6 +215,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
202
215
|
self._lookback_window if self._global_cursor else 0,
|
203
216
|
)
|
204
217
|
with self._lock:
|
218
|
+
self._number_of_partitions += 1
|
205
219
|
self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
|
206
220
|
self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
|
207
221
|
threading.Semaphore(0)
|
@@ -232,9 +246,15 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
232
246
|
- Logs a warning each time a partition is removed, indicating whether it was finished
|
233
247
|
or removed due to being the oldest.
|
234
248
|
"""
|
249
|
+
if not self._use_global_cursor and self.limit_reached():
|
250
|
+
logger.info(
|
251
|
+
f"Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of {self.SWITCH_TO_GLOBAL_LIMIT}. "
|
252
|
+
f"Switching to global cursor for {self._stream_name}."
|
253
|
+
)
|
254
|
+
self._use_global_cursor = True
|
255
|
+
|
235
256
|
with self._lock:
|
236
257
|
while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
|
237
|
-
self._over_limit += 1
|
238
258
|
# Try removing finished partitions first
|
239
259
|
for partition_key in list(self._cursor_per_partition.keys()):
|
240
260
|
if (
|
@@ -245,7 +265,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
245
265
|
partition_key
|
246
266
|
) # Remove the oldest partition
|
247
267
|
logger.warning(
|
248
|
-
f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self.
|
268
|
+
f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
|
249
269
|
)
|
250
270
|
break
|
251
271
|
else:
|
@@ -254,7 +274,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
254
274
|
1
|
255
275
|
] # Remove the oldest partition
|
256
276
|
logger.warning(
|
257
|
-
f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self.
|
277
|
+
f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
|
258
278
|
)
|
259
279
|
|
260
280
|
def _set_initial_state(self, stream_state: StreamState) -> None:
|
@@ -314,6 +334,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
314
334
|
self._lookback_window = int(stream_state.get("lookback_window", 0))
|
315
335
|
|
316
336
|
for state in stream_state.get(self._PERPARTITION_STATE_KEY, []):
|
337
|
+
self._number_of_partitions += 1
|
317
338
|
self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
|
318
339
|
self._create_cursor(state["cursor"])
|
319
340
|
)
|
@@ -354,16 +375,26 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
354
375
|
self._new_global_cursor = deepcopy(fixed_global_state)
|
355
376
|
|
356
377
|
def observe(self, record: Record) -> None:
|
357
|
-
if not self._use_global_cursor and self.limit_reached():
|
358
|
-
self._use_global_cursor = True
|
359
|
-
|
360
378
|
if not record.associated_slice:
|
361
379
|
raise ValueError(
|
362
380
|
"Invalid state as stream slices that are emitted should refer to an existing cursor"
|
363
381
|
)
|
364
|
-
|
365
|
-
|
366
|
-
|
382
|
+
|
383
|
+
record_cursor = self._connector_state_converter.output_format(
|
384
|
+
self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
|
385
|
+
)
|
386
|
+
self._update_global_cursor(record_cursor)
|
387
|
+
if not self._use_global_cursor:
|
388
|
+
self._cursor_per_partition[
|
389
|
+
self._to_partition_key(record.associated_slice.partition)
|
390
|
+
].observe(record)
|
391
|
+
|
392
|
+
def _update_global_cursor(self, value: Any) -> None:
|
393
|
+
if (
|
394
|
+
self._new_global_cursor is None
|
395
|
+
or self._new_global_cursor[self.cursor_field.cursor_field_key] < value
|
396
|
+
):
|
397
|
+
self._new_global_cursor = {self.cursor_field.cursor_field_key: copy.deepcopy(value)}
|
367
398
|
|
368
399
|
def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
|
369
400
|
return self._partition_serializer.to_partition_key(partition)
|
@@ -397,4 +428,4 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
397
428
|
return cursor
|
398
429
|
|
399
430
|
def limit_reached(self) -> bool:
|
400
|
-
return self.
|
431
|
+
return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT
|
@@ -21,6 +21,7 @@ from airbyte_cdk.sources.declarative.requesters.request_option import (
|
|
21
21
|
)
|
22
22
|
from airbyte_cdk.sources.message import MessageRepository
|
23
23
|
from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState
|
24
|
+
from airbyte_cdk.utils.mapping_helpers import _validate_component_request_option_paths
|
24
25
|
|
25
26
|
|
26
27
|
@dataclass
|
@@ -122,6 +123,10 @@ class DatetimeBasedCursor(DeclarativeCursor):
|
|
122
123
|
if not self.cursor_datetime_formats:
|
123
124
|
self.cursor_datetime_formats = [self.datetime_format]
|
124
125
|
|
126
|
+
_validate_component_request_option_paths(
|
127
|
+
self.config, self.start_time_option, self.end_time_option
|
128
|
+
)
|
129
|
+
|
125
130
|
def get_stream_state(self) -> StreamState:
|
126
131
|
return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {} # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
|
127
132
|
|
@@ -23,6 +23,9 @@ from airbyte_cdk.sources.declarative.requesters.request_option import (
|
|
23
23
|
)
|
24
24
|
from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
|
25
25
|
from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState
|
26
|
+
from airbyte_cdk.utils.mapping_helpers import (
|
27
|
+
_validate_component_request_option_paths,
|
28
|
+
)
|
26
29
|
|
27
30
|
|
28
31
|
@dataclass
|
@@ -113,6 +116,13 @@ class DefaultPaginator(Paginator):
|
|
113
116
|
if isinstance(self.url_base, str):
|
114
117
|
self.url_base = InterpolatedString(string=self.url_base, parameters=parameters)
|
115
118
|
|
119
|
+
if self.page_token_option and not isinstance(self.page_token_option, RequestPath):
|
120
|
+
_validate_component_request_option_paths(
|
121
|
+
self.config,
|
122
|
+
self.page_size_option,
|
123
|
+
self.page_token_option,
|
124
|
+
)
|
125
|
+
|
116
126
|
def get_initial_token(self) -> Optional[Any]:
|
117
127
|
"""
|
118
128
|
Return the page token that should be used for the first request of a stream
|
@@ -11,6 +11,7 @@ from pydantic.v1 import AnyUrl, BaseModel, Field
|
|
11
11
|
|
12
12
|
from airbyte_cdk import OneOfOptionConfig
|
13
13
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
14
|
+
from airbyte_cdk.sources.specs.transfer_modes import DeliverPermissions
|
14
15
|
from airbyte_cdk.sources.utils import schema_helpers
|
15
16
|
|
16
17
|
|
@@ -65,7 +66,7 @@ class AbstractFileBasedSpec(BaseModel):
|
|
65
66
|
order=10,
|
66
67
|
)
|
67
68
|
|
68
|
-
delivery_method: Union[DeliverRecords, DeliverRawFiles] = Field(
|
69
|
+
delivery_method: Union[DeliverRecords, DeliverRawFiles, DeliverPermissions] = Field(
|
69
70
|
title="Delivery Method",
|
70
71
|
discriminator="delivery_type",
|
71
72
|
type="object",
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import (
|
6
|
+
AbstractFileBasedSpec,
|
7
|
+
DeliverRawFiles,
|
8
|
+
)
|
9
|
+
from airbyte_cdk.sources.specs.transfer_modes import DeliverPermissions
|
10
|
+
|
11
|
+
DELIVERY_TYPE_KEY = "delivery_type"
|
12
|
+
DELIVERY_TYPE_PERMISSION_TRANSFER_MODE_VALUE = "use_permissions_transfer"
|
13
|
+
DELIVERY_TYPE_FILES_TRANSFER_MODE_VALUE = "use_file_transfer"
|
14
|
+
PRESERVE_DIRECTORY_STRUCTURE_KEY = "preserve_directory_structure"
|
15
|
+
INCLUDE_IDENTITIES_STREAM_KEY = "include_identities_stream"
|
16
|
+
|
17
|
+
|
18
|
+
def use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
|
19
|
+
"""Returns `True` if the configuration uses file transfer mode."""
|
20
|
+
return (
|
21
|
+
hasattr(parsed_config.delivery_method, DELIVERY_TYPE_KEY)
|
22
|
+
and parsed_config.delivery_method.delivery_type == DELIVERY_TYPE_FILES_TRANSFER_MODE_VALUE
|
23
|
+
)
|
24
|
+
|
25
|
+
|
26
|
+
def preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
|
27
|
+
"""
|
28
|
+
Determines whether to preserve directory structure during file transfer.
|
29
|
+
|
30
|
+
When enabled, files maintain their subdirectory paths in the destination.
|
31
|
+
When disabled, files are flattened to the root of the destination.
|
32
|
+
|
33
|
+
Args:
|
34
|
+
parsed_config: The parsed configuration containing delivery method settings
|
35
|
+
|
36
|
+
Returns:
|
37
|
+
True if directory structure should be preserved (default), False otherwise
|
38
|
+
"""
|
39
|
+
if (
|
40
|
+
use_file_transfer(parsed_config)
|
41
|
+
and hasattr(parsed_config.delivery_method, PRESERVE_DIRECTORY_STRUCTURE_KEY)
|
42
|
+
and isinstance(parsed_config.delivery_method, DeliverRawFiles)
|
43
|
+
):
|
44
|
+
return parsed_config.delivery_method.preserve_directory_structure
|
45
|
+
return True
|
46
|
+
|
47
|
+
|
48
|
+
def use_permissions_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
|
49
|
+
"""
|
50
|
+
Determines whether to use permissions transfer to sync ACLs and Identities
|
51
|
+
|
52
|
+
Args:
|
53
|
+
parsed_config: The parsed configuration containing delivery method settings
|
54
|
+
|
55
|
+
Returns:
|
56
|
+
True if permissions transfer should be enabled, False otherwise
|
57
|
+
"""
|
58
|
+
return (
|
59
|
+
hasattr(parsed_config.delivery_method, DELIVERY_TYPE_KEY)
|
60
|
+
and parsed_config.delivery_method.delivery_type
|
61
|
+
== DELIVERY_TYPE_PERMISSION_TRANSFER_MODE_VALUE
|
62
|
+
)
|
63
|
+
|
64
|
+
|
65
|
+
def include_identities_stream(parsed_config: AbstractFileBasedSpec) -> bool:
|
66
|
+
"""
|
67
|
+
There are scenarios where the user may not have access to identities but it is still valuable to get ACLs
|
68
|
+
|
69
|
+
Args:
|
70
|
+
parsed_config: The parsed configuration containing delivery method settings
|
71
|
+
|
72
|
+
Returns:
|
73
|
+
True if we should include Identities stream.
|
74
|
+
"""
|
75
|
+
if (
|
76
|
+
use_permissions_transfer(parsed_config)
|
77
|
+
and hasattr(parsed_config.delivery_method, INCLUDE_IDENTITIES_STREAM_KEY)
|
78
|
+
and isinstance(parsed_config.delivery_method, DeliverPermissions)
|
79
|
+
):
|
80
|
+
return parsed_config.delivery_method.include_identities_stream
|
81
|
+
return False
|
@@ -33,6 +33,12 @@ from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
|
|
33
33
|
FileBasedStreamConfig,
|
34
34
|
ValidationPolicy,
|
35
35
|
)
|
36
|
+
from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
|
37
|
+
include_identities_stream,
|
38
|
+
preserve_directory_structure,
|
39
|
+
use_file_transfer,
|
40
|
+
use_permissions_transfer,
|
41
|
+
)
|
36
42
|
from airbyte_cdk.sources.file_based.discovery_policy import (
|
37
43
|
AbstractDiscoveryPolicy,
|
38
44
|
DefaultDiscoveryPolicy,
|
@@ -49,7 +55,12 @@ from airbyte_cdk.sources.file_based.schema_validation_policies import (
|
|
49
55
|
DEFAULT_SCHEMA_VALIDATION_POLICIES,
|
50
56
|
AbstractSchemaValidationPolicy,
|
51
57
|
)
|
52
|
-
from airbyte_cdk.sources.file_based.stream import
|
58
|
+
from airbyte_cdk.sources.file_based.stream import (
|
59
|
+
AbstractFileBasedStream,
|
60
|
+
DefaultFileBasedStream,
|
61
|
+
FileIdentitiesStream,
|
62
|
+
PermissionsFileBasedStream,
|
63
|
+
)
|
53
64
|
from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
|
54
65
|
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
|
55
66
|
AbstractConcurrentFileBasedCursor,
|
@@ -66,6 +77,7 @@ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
|
66
77
|
DEFAULT_CONCURRENCY = 100
|
67
78
|
MAX_CONCURRENCY = 100
|
68
79
|
INITIAL_N_PARTITIONS = MAX_CONCURRENCY // 2
|
80
|
+
IDENTITIES_STREAM = "identities"
|
69
81
|
|
70
82
|
|
71
83
|
class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
@@ -157,13 +169,20 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
157
169
|
errors = []
|
158
170
|
tracebacks = []
|
159
171
|
for stream in streams:
|
172
|
+
if isinstance(stream, FileIdentitiesStream):
|
173
|
+
identity = next(iter(stream.load_identity_groups()))
|
174
|
+
if not identity:
|
175
|
+
errors.append(
|
176
|
+
"Unable to get identities for current configuration, please check your credentials"
|
177
|
+
)
|
178
|
+
continue
|
160
179
|
if not isinstance(stream, AbstractFileBasedStream):
|
161
180
|
raise ValueError(f"Stream {stream} is not a file-based stream.")
|
162
181
|
try:
|
163
182
|
parsed_config = self._get_parsed_config(config)
|
164
183
|
availability_method = (
|
165
184
|
stream.availability_strategy.check_availability
|
166
|
-
if
|
185
|
+
if use_file_transfer(parsed_config) or use_permissions_transfer(parsed_config)
|
167
186
|
else stream.availability_strategy.check_availability_and_parsability
|
168
187
|
)
|
169
188
|
(
|
@@ -239,7 +258,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
239
258
|
message_repository=self.message_repository,
|
240
259
|
)
|
241
260
|
stream = FileBasedStreamFacade.create_from_stream(
|
242
|
-
stream=self.
|
261
|
+
stream=self._make_file_based_stream(
|
243
262
|
stream_config=stream_config,
|
244
263
|
cursor=cursor,
|
245
264
|
parsed_config=parsed_config,
|
@@ -270,7 +289,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
270
289
|
CursorField(DefaultFileBasedStream.ab_last_mod_col),
|
271
290
|
)
|
272
291
|
stream = FileBasedStreamFacade.create_from_stream(
|
273
|
-
stream=self.
|
292
|
+
stream=self._make_file_based_stream(
|
274
293
|
stream_config=stream_config,
|
275
294
|
cursor=cursor,
|
276
295
|
parsed_config=parsed_config,
|
@@ -282,13 +301,17 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
282
301
|
)
|
283
302
|
else:
|
284
303
|
cursor = self.cursor_cls(stream_config)
|
285
|
-
stream = self.
|
304
|
+
stream = self._make_file_based_stream(
|
286
305
|
stream_config=stream_config,
|
287
306
|
cursor=cursor,
|
288
307
|
parsed_config=parsed_config,
|
289
308
|
)
|
290
309
|
|
291
310
|
streams.append(stream)
|
311
|
+
|
312
|
+
if include_identities_stream(parsed_config):
|
313
|
+
identities_stream = self._make_identities_stream()
|
314
|
+
streams.append(identities_stream)
|
292
315
|
return streams
|
293
316
|
|
294
317
|
except ValidationError as exc:
|
@@ -310,8 +333,48 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
310
333
|
validation_policy=self._validate_and_get_validation_policy(stream_config),
|
311
334
|
errors_collector=self.errors_collector,
|
312
335
|
cursor=cursor,
|
313
|
-
use_file_transfer=
|
314
|
-
preserve_directory_structure=
|
336
|
+
use_file_transfer=use_file_transfer(parsed_config),
|
337
|
+
preserve_directory_structure=preserve_directory_structure(parsed_config),
|
338
|
+
)
|
339
|
+
|
340
|
+
def _make_permissions_stream(
|
341
|
+
self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor]
|
342
|
+
) -> AbstractFileBasedStream:
|
343
|
+
return PermissionsFileBasedStream(
|
344
|
+
config=stream_config,
|
345
|
+
catalog_schema=self.stream_schemas.get(stream_config.name),
|
346
|
+
stream_reader=self.stream_reader,
|
347
|
+
availability_strategy=self.availability_strategy,
|
348
|
+
discovery_policy=self.discovery_policy,
|
349
|
+
parsers=self.parsers,
|
350
|
+
validation_policy=self._validate_and_get_validation_policy(stream_config),
|
351
|
+
errors_collector=self.errors_collector,
|
352
|
+
cursor=cursor,
|
353
|
+
)
|
354
|
+
|
355
|
+
def _make_file_based_stream(
|
356
|
+
self,
|
357
|
+
stream_config: FileBasedStreamConfig,
|
358
|
+
cursor: Optional[AbstractFileBasedCursor],
|
359
|
+
parsed_config: AbstractFileBasedSpec,
|
360
|
+
) -> AbstractFileBasedStream:
|
361
|
+
"""
|
362
|
+
Creates different streams depending on the type of the transfer mode selected
|
363
|
+
"""
|
364
|
+
if use_permissions_transfer(parsed_config):
|
365
|
+
return self._make_permissions_stream(stream_config, cursor)
|
366
|
+
# we should have a stream for File transfer mode to decouple from DefaultFileBasedStream
|
367
|
+
else:
|
368
|
+
return self._make_default_stream(stream_config, cursor, parsed_config)
|
369
|
+
|
370
|
+
def _make_identities_stream(
|
371
|
+
self,
|
372
|
+
) -> Stream:
|
373
|
+
return FileIdentitiesStream(
|
374
|
+
catalog_schema=self.stream_schemas.get(FileIdentitiesStream.IDENTITIES_STREAM_NAME),
|
375
|
+
stream_reader=self.stream_reader,
|
376
|
+
discovery_policy=self.discovery_policy,
|
377
|
+
errors_collector=self.errors_collector,
|
315
378
|
)
|
316
379
|
|
317
380
|
def _get_stream_from_catalog(
|
@@ -378,33 +441,3 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
378
441
|
"`input_schema` and `schemaless` options cannot both be set",
|
379
442
|
model=FileBasedStreamConfig,
|
380
443
|
)
|
381
|
-
|
382
|
-
@staticmethod
|
383
|
-
def _use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
|
384
|
-
use_file_transfer = (
|
385
|
-
hasattr(parsed_config.delivery_method, "delivery_type")
|
386
|
-
and parsed_config.delivery_method.delivery_type == "use_file_transfer"
|
387
|
-
)
|
388
|
-
return use_file_transfer
|
389
|
-
|
390
|
-
@staticmethod
|
391
|
-
def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
|
392
|
-
"""
|
393
|
-
Determines whether to preserve directory structure during file transfer.
|
394
|
-
|
395
|
-
When enabled, files maintain their subdirectory paths in the destination.
|
396
|
-
When disabled, files are flattened to the root of the destination.
|
397
|
-
|
398
|
-
Args:
|
399
|
-
parsed_config: The parsed configuration containing delivery method settings
|
400
|
-
|
401
|
-
Returns:
|
402
|
-
True if directory structure should be preserved (default), False otherwise
|
403
|
-
"""
|
404
|
-
if (
|
405
|
-
FileBasedSource._use_file_transfer(parsed_config)
|
406
|
-
and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
|
407
|
-
and parsed_config.delivery_method.preserve_directory_structure is not None
|
408
|
-
):
|
409
|
-
return parsed_config.delivery_method.preserve_directory_structure
|
410
|
-
return True
|