airbyte-cdk 6.34.0.dev0__py3-none-any.whl → 6.34.0.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (31)
  1. airbyte_cdk/connector_builder/connector_builder_handler.py +16 -12
  2. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  3. airbyte_cdk/connector_builder/test_reader/helpers.py +591 -0
  4. airbyte_cdk/connector_builder/test_reader/message_grouper.py +160 -0
  5. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  6. airbyte_cdk/connector_builder/test_reader/types.py +75 -0
  7. airbyte_cdk/entrypoint.py +6 -6
  8. airbyte_cdk/logger.py +1 -4
  9. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +122 -38
  10. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +5 -0
  11. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +10 -0
  12. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +2 -1
  13. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  14. airbyte_cdk/sources/file_based/file_based_source.py +70 -37
  15. airbyte_cdk/sources/file_based/file_based_stream_reader.py +107 -12
  16. airbyte_cdk/sources/file_based/stream/__init__.py +10 -1
  17. airbyte_cdk/sources/file_based/stream/identities_stream.py +47 -0
  18. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +85 -0
  19. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  20. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  21. airbyte_cdk/test/mock_http/mocker.py +9 -1
  22. airbyte_cdk/test/mock_http/response.py +6 -3
  23. airbyte_cdk/utils/mapping_helpers.py +43 -2
  24. airbyte_cdk/utils/print_buffer.py +0 -4
  25. {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/METADATA +1 -1
  26. {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/RECORD +30 -21
  27. airbyte_cdk/connector_builder/message_grouper.py +0 -448
  28. {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/LICENSE.txt +0 -0
  29. {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/LICENSE_SHORT +0 -0
  30. {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/WHEEL +0 -0
  31. {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/entry_points.txt +0 -0
airbyte_cdk/entrypoint.py CHANGED
@@ -22,7 +22,7 @@ from requests import PreparedRequest, Response, Session
 
 from airbyte_cdk.connector import TConfig
 from airbyte_cdk.exception_handler import init_uncaught_exception_handler
-from airbyte_cdk.logger import PRINT_BUFFER, init_logger
+from airbyte_cdk.logger import init_logger
 from airbyte_cdk.models import (
     AirbyteConnectionStatus,
     AirbyteMessage,
@@ -337,11 +337,11 @@ def launch(source: Source, args: List[str]) -> None:
     parsed_args = source_entrypoint.parse_args(args)
     # temporarily removes the PrintBuffer because we're seeing weird print behavior for concurrent syncs
     # Refer to: https://github.com/airbytehq/oncall/issues/6235
-    with PRINT_BUFFER:
-        for message in source_entrypoint.run(parsed_args):
-            # simply printing is creating issues for concurrent CDK as Python uses different two instructions to print: one for the message and
-            # the other for the break line. Adding `\n` to the message ensure that both are printed at the same time
-            print(f"{message}\n", end="")
+    # with PrintBuffer():
+    for message in source_entrypoint.run(parsed_args):
+        # simply printing is creating issues for concurrent CDK as Python uses different two instructions to print: one for the message and
+        # the other for the break line. Adding `\n` to the message ensure that both are printed at the same time
+        print(f"{message}\n", end="", flush=True)
 
 
 def _init_internal_request_filter() -> None:
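
Note: the replacement relies on the message and its trailing newline going out in a single write, which is far less likely to interleave under concurrency than print's usual two writes (the message, then the line break). A minimal sketch of the same pattern, not CDK code:

    import threading

    def emit(message: str) -> None:
        # One buffered write per message: the newline is part of the payload,
        # so another thread cannot slip output between message and line break.
        print(f"{message}\n", end="", flush=True)

    threads = [
        threading.Thread(target=emit, args=(f'{{"type": "RECORD", "id": {i}}}',))
        for i in range(4)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()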
airbyte_cdk/logger.py CHANGED
@@ -16,11 +16,8 @@ from airbyte_cdk.models import (
     Level,
     Type,
 )
-from airbyte_cdk.utils import PrintBuffer
 from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
 
-PRINT_BUFFER = PrintBuffer(flush_interval=0.1)
-
 LOGGING_CONFIG = {
     "version": 1,
     "disable_existing_loggers": False,
@@ -30,7 +27,7 @@ LOGGING_CONFIG = {
     "handlers": {
         "console": {
             "class": "logging.StreamHandler",
-            "stream": PRINT_BUFFER,
+            "stream": "ext://sys.stdout",
             "formatter": "airbyte",
         },
     },
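
Note: "ext://sys.stdout" is the standard `logging.config.dictConfig` syntax for resolving an external object, so the console handler binds to sys.stdout without instantiating a custom stream at import time. A minimal sketch of the same wiring (formatter simplified; the CDK uses its own "airbyte" formatter):

    import logging.config

    LOGGING_CONFIG = {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {"plain": {"format": "%(message)s"}},
        "handlers": {
            "console": {
                "class": "logging.StreamHandler",
                "stream": "ext://sys.stdout",  # resolved by dictConfig to sys.stdout
                "formatter": "plain",
            }
        },
        "root": {"handlers": ["console"], "level": "INFO"},
    }

    logging.config.dictConfig(LOGGING_CONFIG)
    logging.getLogger(__name__).info("written straight to sys.stdout")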
airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py CHANGED
@@ -5,6 +5,7 @@
 import copy
 import logging
 import threading
+import time
 from collections import OrderedDict
 from copy import deepcopy
 from datetime import timedelta
@@ -58,7 +59,8 @@ class ConcurrentPerPartitionCursor(Cursor):
     CurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}.
     """
 
-    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
+    DEFAULT_MAX_PARTITIONS_NUMBER = 25_000
+    SWITCH_TO_GLOBAL_LIMIT = 10_000
     _NO_STATE: Mapping[str, Any] = {}
     _NO_CURSOR_STATE: Mapping[str, Any] = {}
     _GLOBAL_STATE_KEY = "state"
@@ -93,15 +95,21 @@ class ConcurrentPerPartitionCursor(Cursor):
         # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
         self._cursor_per_partition: OrderedDict[str, ConcurrentCursor] = OrderedDict()
         self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()
+
+        # Parent-state tracking: store each partition's parent state in creation order
+        self._partition_parent_state_map: OrderedDict[str, Mapping[str, Any]] = OrderedDict()
+
         self._finished_partitions: set[str] = set()
         self._lock = threading.Lock()
         self._timer = Timer()
         self._new_global_cursor: Optional[StreamState] = None
         self._lookback_window: int = 0
         self._parent_state: Optional[StreamState] = None
-        self._over_limit: int = 0
+        self._number_of_partitions: int = 0
         self._use_global_cursor: bool = False
         self._partition_serializer = PerPartitionKeySerializer()
+        # Track the last time a state message was emitted
+        self._last_emission_time: float = 0.0
 
         self._set_initial_state(stream_state)
 
@@ -141,22 +149,55 @@ class ConcurrentPerPartitionCursor(Cursor):
             raise ValueError("stream_slice cannot be None")
 
         partition_key = self._to_partition_key(stream_slice.partition)
-        self._cursor_per_partition[partition_key].close_partition(partition=partition)
         with self._lock:
             self._semaphore_per_partition[partition_key].acquire()
-            cursor = self._cursor_per_partition[partition_key]
-            if (
-                partition_key in self._finished_partitions
-                and self._semaphore_per_partition[partition_key]._value == 0
-            ):
+            if not self._use_global_cursor:
+                self._cursor_per_partition[partition_key].close_partition(partition=partition)
+                cursor = self._cursor_per_partition[partition_key]
                 if (
-                    self._new_global_cursor is None
-                    or self._new_global_cursor[self.cursor_field.cursor_field_key]
-                    < cursor.state[self.cursor_field.cursor_field_key]
+                    partition_key in self._finished_partitions
+                    and self._semaphore_per_partition[partition_key]._value == 0
                 ):
-                    self._new_global_cursor = copy.deepcopy(cursor.state)
-        if not self._use_global_cursor:
-            self._emit_state_message()
+                    self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key])
+
+            self._check_and_update_parent_state()
+
+            self._emit_state_message()
+
+    def _check_and_update_parent_state(self) -> None:
+        """
+        Pop the leftmost partition state from _partition_parent_state_map only if
+        *all partitions* up to (and including) that partition key in _semaphore_per_partition
+        are fully finished (i.e. in _finished_partitions and semaphore._value == 0).
+        """
+        last_closed_state = None
+
+        while self._partition_parent_state_map:
+            # Look at the earliest partition key in creation order
+            earliest_key = next(iter(self._partition_parent_state_map))
+
+            # Verify ALL partitions from the left up to earliest_key are finished
+            all_left_finished = True
+            for p_key, sem in self._semaphore_per_partition.items():
+                # If any earlier partition is still not finished, we must stop
+                if p_key not in self._finished_partitions or sem._value != 0:
+                    all_left_finished = False
+                    break
+                # Once we've reached earliest_key in the semaphore order, we can stop checking
+                if p_key == earliest_key:
+                    break
+
+            # If the partitions up to earliest_key are not all finished, break the while-loop
+            if not all_left_finished:
+                break
+
+            # Otherwise, pop the leftmost entry from parent-state map
+            _, closed_parent_state = self._partition_parent_state_map.popitem(last=False)
+            last_closed_state = closed_parent_state
+
+        # Update _parent_state if we actually popped at least one partition
+        if last_closed_state is not None:
+            self._parent_state = last_closed_state
 
     def ensure_at_least_one_state_emitted(self) -> None:
         """
@@ -169,9 +210,23 @@ class ConcurrentPerPartitionCursor(Cursor):
         self._global_cursor = self._new_global_cursor
         self._lookback_window = self._timer.finish()
         self._parent_state = self._partition_router.get_stream_state()
-        self._emit_state_message()
+        self._emit_state_message(throttle=False)
 
-    def _emit_state_message(self) -> None:
+    def _throttle_state_message(self) -> Optional[float]:
+        """
+        Throttles the state message emission to once every 60 seconds.
+        """
+        current_time = time.time()
+        if current_time - self._last_emission_time <= 60:
+            return None
+        return current_time
+
+    def _emit_state_message(self, throttle: bool = True) -> None:
+        if throttle:
+            current_time = self._throttle_state_message()
+            if current_time is None:
+                return
+            self._last_emission_time = current_time
         self._connector_state_manager.update_state_for_stream(
             self._stream_name,
             self._stream_namespace,
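
Note: the gate returns a timestamp only when more than 60 seconds have passed since the last emission, and ensure_at_least_one_state_emitted passes throttle=False so the final state always goes out. The same gate in isolation (hypothetical names, not CDK code):

    import time

    class ThrottledEmitter:
        def __init__(self) -> None:
            self._last_emission_time = 0.0

        def emit(self, message: str, throttle: bool = True) -> None:
            now = time.time()
            if throttle and now - self._last_emission_time <= 60:
                return  # dropped: a message went out less than a minute ago
            self._last_emission_time = now
            print(message)

    emitter = ThrottledEmitter()
    emitter.emit("state 1")                      # emitted
    emitter.emit("state 2")                      # dropped by the throttle
    emitter.emit("final state", throttle=False)  # bypasses the gate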
@@ -188,13 +243,19 @@ class ConcurrentPerPartitionCursor(Cursor):
 
         slices = self._partition_router.stream_slices()
         self._timer.start()
-        for partition in slices:
-            yield from self._generate_slices_from_partition(partition)
+        for partition, last, parent_state in iterate_with_last_flag_and_state(
+            slices, self._partition_router.get_stream_state
+        ):
+            yield from self._generate_slices_from_partition(partition, parent_state)
 
-    def _generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
+    def _generate_slices_from_partition(
+        self, partition: StreamSlice, parent_state: Mapping[str, Any]
+    ) -> Iterable[StreamSlice]:
         # Ensure the maximum number of partitions is not exceeded
         self._ensure_partition_limit()
 
+        partition_key = self._to_partition_key(partition.partition)
+
         cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
         if not cursor:
             cursor = self._create_cursor(
@@ -202,18 +263,27 @@ class ConcurrentPerPartitionCursor(Cursor):
                 self._lookback_window if self._global_cursor else 0,
             )
             with self._lock:
-                self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
-                self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
-                    threading.Semaphore(0)
-                )
+                self._number_of_partitions += 1
+                self._cursor_per_partition[partition_key] = cursor
+                self._semaphore_per_partition[partition_key] = threading.Semaphore(0)
+
+        with self._lock:
+            if (
+                len(self._partition_parent_state_map) == 0
+                or self._partition_parent_state_map[
+                    next(reversed(self._partition_parent_state_map))
+                ]
+                != parent_state
+            ):
+                self._partition_parent_state_map[partition_key] = deepcopy(parent_state)
 
         for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
             cursor.stream_slices(),
             lambda: None,
         ):
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)].release()
+            self._semaphore_per_partition[partition_key].release()
             if is_last_slice:
-                self._finished_partitions.add(self._to_partition_key(partition.partition))
+                self._finished_partitions.add(partition_key)
             yield StreamSlice(
                 partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
             )
@@ -232,9 +302,15 @@ class ConcurrentPerPartitionCursor(Cursor):
         - Logs a warning each time a partition is removed, indicating whether it was finished
           or removed due to being the oldest.
         """
+        if not self._use_global_cursor and self.limit_reached():
+            logger.info(
+                f"Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of {self.SWITCH_TO_GLOBAL_LIMIT}. "
+                f"Switching to global cursor for {self._stream_name}."
+            )
+            self._use_global_cursor = True
+
         with self._lock:
             while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
-                self._over_limit += 1
                 # Try removing finished partitions first
                 for partition_key in list(self._cursor_per_partition.keys()):
                     if (
@@ -245,7 +321,7 @@ class ConcurrentPerPartitionCursor(Cursor):
                             partition_key
                         )  # Remove the oldest partition
                         logger.warning(
-                            f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._over_limit}."
+                            f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
                         )
                         break
                 else:
@@ -254,7 +330,7 @@ class ConcurrentPerPartitionCursor(Cursor):
                         1
                     ]  # Remove the oldest partition
                     logger.warning(
-                        f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
+                        f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
                     )
 
     def _set_initial_state(self, stream_state: StreamState) -> None:
@@ -314,12 +390,10 @@ class ConcurrentPerPartitionCursor(Cursor):
             self._lookback_window = int(stream_state.get("lookback_window", 0))
 
             for state in stream_state.get(self._PERPARTITION_STATE_KEY, []):
+                self._number_of_partitions += 1
                 self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
                     self._create_cursor(state["cursor"])
                 )
-                self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
-                    threading.Semaphore(0)
-                )
 
             # set default state for missing partitions if it is per partition with fallback to global
             if self._GLOBAL_STATE_KEY in stream_state:
@@ -354,16 +428,26 @@ class ConcurrentPerPartitionCursor(Cursor):
             self._new_global_cursor = deepcopy(fixed_global_state)
 
     def observe(self, record: Record) -> None:
-        if not self._use_global_cursor and self.limit_reached():
-            self._use_global_cursor = True
-
         if not record.associated_slice:
             raise ValueError(
                 "Invalid state as stream slices that are emitted should refer to an existing cursor"
            )
-        self._cursor_per_partition[
-            self._to_partition_key(record.associated_slice.partition)
-        ].observe(record)
+
+        record_cursor = self._connector_state_converter.output_format(
+            self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
+        )
+        self._update_global_cursor(record_cursor)
+        if not self._use_global_cursor:
+            self._cursor_per_partition[
+                self._to_partition_key(record.associated_slice.partition)
+            ].observe(record)
+
+    def _update_global_cursor(self, value: Any) -> None:
+        if (
+            self._new_global_cursor is None
+            or self._new_global_cursor[self.cursor_field.cursor_field_key] < value
+        ):
+            self._new_global_cursor = {self.cursor_field.cursor_field_key: copy.deepcopy(value)}
 
     def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
         return self._partition_serializer.to_partition_key(partition)
@@ -397,4 +481,4 @@ class ConcurrentPerPartitionCursor(Cursor):
         return cursor
 
     def limit_reached(self) -> bool:
-        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
+        return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT
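
Note: the renamed counter gives the two constants distinct jobs: once _number_of_partitions exceeds SWITCH_TO_GLOBAL_LIMIT (10,000) the stream flips to a single global cursor, while DEFAULT_MAX_PARTITIONS_NUMBER (25,000) caps how many per-partition cursors stay in memory before the oldest are evicted. A toy illustration of the thresholds (constants copied from the diff, logic simplified):

    SWITCH_TO_GLOBAL_LIMIT = 10_000         # stop tracking state per partition
    DEFAULT_MAX_PARTITIONS_NUMBER = 25_000  # start evicting the oldest cursors

    def cursor_strategy(number_of_partitions: int) -> str:
        if number_of_partitions > DEFAULT_MAX_PARTITIONS_NUMBER:
            return "global cursor, oldest per-partition cursors evicted"
        if number_of_partitions > SWITCH_TO_GLOBAL_LIMIT:
            return "global cursor, per-partition cursors retained"
        return "per-partition cursors"

    for n in (5_000, 12_000, 30_000):
        print(n, "->", cursor_strategy(n))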
airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py CHANGED
@@ -21,6 +21,7 @@ from airbyte_cdk.sources.declarative.requesters.request_option import (
 )
 from airbyte_cdk.sources.message import MessageRepository
 from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState
+from airbyte_cdk.utils.mapping_helpers import _validate_component_request_option_paths
 
 
 @dataclass
@@ -122,6 +123,10 @@ class DatetimeBasedCursor(DeclarativeCursor):
         if not self.cursor_datetime_formats:
             self.cursor_datetime_formats = [self.datetime_format]
 
+        _validate_component_request_option_paths(
+            self.config, self.start_time_option, self.end_time_option
+        )
+
     def get_stream_state(self) -> StreamState:
         return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {}  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__
 
airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py CHANGED
@@ -23,6 +23,9 @@ from airbyte_cdk.sources.declarative.requesters.request_option import (
 )
 from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
 from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState
+from airbyte_cdk.utils.mapping_helpers import (
+    _validate_component_request_option_paths,
+)
 
 
 @dataclass
@@ -113,6 +116,13 @@ class DefaultPaginator(Paginator):
         if isinstance(self.url_base, str):
             self.url_base = InterpolatedString(string=self.url_base, parameters=parameters)
 
+        if self.page_token_option and not isinstance(self.page_token_option, RequestPath):
+            _validate_component_request_option_paths(
+                self.config,
+                self.page_size_option,
+                self.page_token_option,
+            )
+
     def get_initial_token(self) -> Optional[Any]:
         """
         Return the page token that should be used for the first request of a stream
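
Note: both DatetimeBasedCursor and DefaultPaginator now call _validate_component_request_option_paths at init time so misconfigured request options fail fast; its body lives in airbyte_cdk/utils/mapping_helpers.py (changed +43 -2 above) and is not shown in this diff. A hypothetical sketch of the kind of check such a validator can perform, rejecting two options that would inject into the same request path (names and behavior assumed, not taken from the CDK):

    from typing import Optional

    class RequestOption:
        def __init__(self, inject_into: str, field_name: str) -> None:
            self.inject_into = inject_into  # e.g. "request_parameter"
            self.field_name = field_name    # e.g. "start_time"

    def validate_option_paths(*options: Optional["RequestOption"]) -> None:
        seen = set()
        for option in options:
            if option is None:
                continue  # unset options are fine
            path = (option.inject_into, option.field_name)
            if path in seen:
                raise ValueError(f"Multiple request options would write to {path}")
            seen.add(path)

    validate_option_paths(
        RequestOption("request_parameter", "start_time"),
        RequestOption("request_parameter", "end_time"),  # distinct paths: passes
    )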
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py CHANGED
@@ -11,6 +11,7 @@ from pydantic.v1 import AnyUrl, BaseModel, Field
 
 from airbyte_cdk import OneOfOptionConfig
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
+from airbyte_cdk.sources.specs.transfer_modes import DeliverPermissions
 from airbyte_cdk.sources.utils import schema_helpers
 
 
@@ -65,7 +66,7 @@ class AbstractFileBasedSpec(BaseModel):
         order=10,
     )
 
-    delivery_method: Union[DeliverRecords, DeliverRawFiles] = Field(
+    delivery_method: Union[DeliverRecords, DeliverRawFiles, DeliverPermissions] = Field(
         title="Delivery Method",
         discriminator="delivery_type",
         type="object",
airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py ADDED
@@ -0,0 +1,81 @@
+#
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+#
+
+from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import (
+    AbstractFileBasedSpec,
+    DeliverRawFiles,
+)
+from airbyte_cdk.sources.specs.transfer_modes import DeliverPermissions
+
+DELIVERY_TYPE_KEY = "delivery_type"
+DELIVERY_TYPE_PERMISSION_TRANSFER_MODE_VALUE = "use_permissions_transfer"
+DELIVERY_TYPE_FILES_TRANSFER_MODE_VALUE = "use_file_transfer"
+PRESERVE_DIRECTORY_STRUCTURE_KEY = "preserve_directory_structure"
+INCLUDE_IDENTITIES_STREAM_KEY = "include_identities_stream"
+
+
+def use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
+    """Returns `True` if the configuration uses file transfer mode."""
+    return (
+        hasattr(parsed_config.delivery_method, DELIVERY_TYPE_KEY)
+        and parsed_config.delivery_method.delivery_type == DELIVERY_TYPE_FILES_TRANSFER_MODE_VALUE
+    )
+
+
+def preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
+    """
+    Determines whether to preserve directory structure during file transfer.
+
+    When enabled, files maintain their subdirectory paths in the destination.
+    When disabled, files are flattened to the root of the destination.
+
+    Args:
+        parsed_config: The parsed configuration containing delivery method settings
+
+    Returns:
+        True if directory structure should be preserved (default), False otherwise
+    """
+    if (
+        use_file_transfer(parsed_config)
+        and hasattr(parsed_config.delivery_method, PRESERVE_DIRECTORY_STRUCTURE_KEY)
+        and isinstance(parsed_config.delivery_method, DeliverRawFiles)
+    ):
+        return parsed_config.delivery_method.preserve_directory_structure
+    return True
+
+
+def use_permissions_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
+    """
+    Determines whether to use permissions transfer to sync ACLs and Identities
+
+    Args:
+        parsed_config: The parsed configuration containing delivery method settings
+
+    Returns:
+        True if permissions transfer should be enabled, False otherwise
+    """
+    return (
+        hasattr(parsed_config.delivery_method, DELIVERY_TYPE_KEY)
+        and parsed_config.delivery_method.delivery_type
+        == DELIVERY_TYPE_PERMISSION_TRANSFER_MODE_VALUE
+    )
+
+
+def include_identities_stream(parsed_config: AbstractFileBasedSpec) -> bool:
+    """
+    There are scenarios where user may not have access to identities but still is valuable to get ACLs
+
+    Args:
+        parsed_config: The parsed configuration containing delivery method settings
+
+    Returns:
+        True if we should include Identities stream.
+    """
+    if (
+        use_permissions_transfer(parsed_config)
+        and hasattr(parsed_config.delivery_method, INCLUDE_IDENTITIES_STREAM_KEY)
+        and isinstance(parsed_config.delivery_method, DeliverPermissions)
+    ):
+        return parsed_config.delivery_method.include_identities_stream
+    return False
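
Note: the helpers only inspect parsed_config.delivery_method, so source code can branch on them without repeating the hasattr/isinstance checks. A usage sketch (the import path is the new module above; plan_streams is a hypothetical caller):

    from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
        include_identities_stream,
        use_file_transfer,
        use_permissions_transfer,
    )

    def plan_streams(parsed_config) -> str:
        # Mirrors the dispatch the source performs when building its stream list.
        if use_permissions_transfer(parsed_config):
            if include_identities_stream(parsed_config):
                return "permissions streams + identities stream"
            return "permissions streams only"
        if use_file_transfer(parsed_config):
            return "raw file streams"
        return "record streams"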
airbyte_cdk/sources/file_based/file_based_source.py CHANGED
@@ -33,6 +33,12 @@ from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
     FileBasedStreamConfig,
     ValidationPolicy,
 )
+from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
+    include_identities_stream,
+    preserve_directory_structure,
+    use_file_transfer,
+    use_permissions_transfer,
+)
 from airbyte_cdk.sources.file_based.discovery_policy import (
     AbstractDiscoveryPolicy,
     DefaultDiscoveryPolicy,
@@ -49,7 +55,12 @@ from airbyte_cdk.sources.file_based.schema_validation_policies import (
     DEFAULT_SCHEMA_VALIDATION_POLICIES,
     AbstractSchemaValidationPolicy,
 )
-from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
+from airbyte_cdk.sources.file_based.stream import (
+    AbstractFileBasedStream,
+    DefaultFileBasedStream,
+    FileIdentitiesStream,
+    PermissionsFileBasedStream,
+)
 from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
 from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
     AbstractConcurrentFileBasedCursor,
@@ -66,6 +77,7 @@ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 DEFAULT_CONCURRENCY = 100
 MAX_CONCURRENCY = 100
 INITIAL_N_PARTITIONS = MAX_CONCURRENCY // 2
+IDENTITIES_STREAM = "identities"
 
 
 class FileBasedSource(ConcurrentSourceAdapter, ABC):
@@ -157,13 +169,20 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
         errors = []
         tracebacks = []
         for stream in streams:
+            if isinstance(stream, FileIdentitiesStream):
+                identity = next(iter(stream.load_identity_groups()))
+                if not identity:
+                    errors.append(
+                        "Unable to get identities for current configuration, please check your credentials"
+                    )
+                continue
             if not isinstance(stream, AbstractFileBasedStream):
                 raise ValueError(f"Stream {stream} is not a file-based stream.")
             try:
                 parsed_config = self._get_parsed_config(config)
                 availability_method = (
                     stream.availability_strategy.check_availability
-                    if self._use_file_transfer(parsed_config)
+                    if use_file_transfer(parsed_config) or use_permissions_transfer(parsed_config)
                     else stream.availability_strategy.check_availability_and_parsability
                 )
                 (
@@ -239,7 +258,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                            message_repository=self.message_repository,
                        )
                        stream = FileBasedStreamFacade.create_from_stream(
-                           stream=self._make_default_stream(
+                           stream=self._make_file_based_stream(
                                stream_config=stream_config,
                                cursor=cursor,
                                parsed_config=parsed_config,
@@ -270,7 +289,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                            CursorField(DefaultFileBasedStream.ab_last_mod_col),
                        )
                        stream = FileBasedStreamFacade.create_from_stream(
-                           stream=self._make_default_stream(
+                           stream=self._make_file_based_stream(
                                stream_config=stream_config,
                                cursor=cursor,
                                parsed_config=parsed_config,
@@ -282,13 +301,17 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                        )
                    else:
                        cursor = self.cursor_cls(stream_config)
-                       stream = self._make_default_stream(
+                       stream = self._make_file_based_stream(
                            stream_config=stream_config,
                            cursor=cursor,
                            parsed_config=parsed_config,
                        )
 
                streams.append(stream)
+
+            if include_identities_stream(parsed_config):
+                identities_stream = self._make_identities_stream()
+                streams.append(identities_stream)
             return streams
 
         except ValidationError as exc:
@@ -310,8 +333,48 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             validation_policy=self._validate_and_get_validation_policy(stream_config),
             errors_collector=self.errors_collector,
             cursor=cursor,
-            use_file_transfer=self._use_file_transfer(parsed_config),
-            preserve_directory_structure=self._preserve_directory_structure(parsed_config),
+            use_file_transfer=use_file_transfer(parsed_config),
+            preserve_directory_structure=preserve_directory_structure(parsed_config),
+        )
+
+    def _make_permissions_stream(
+        self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor]
+    ) -> AbstractFileBasedStream:
+        return PermissionsFileBasedStream(
+            config=stream_config,
+            catalog_schema=self.stream_schemas.get(stream_config.name),
+            stream_reader=self.stream_reader,
+            availability_strategy=self.availability_strategy,
+            discovery_policy=self.discovery_policy,
+            parsers=self.parsers,
+            validation_policy=self._validate_and_get_validation_policy(stream_config),
+            errors_collector=self.errors_collector,
+            cursor=cursor,
+        )
+
+    def _make_file_based_stream(
+        self,
+        stream_config: FileBasedStreamConfig,
+        cursor: Optional[AbstractFileBasedCursor],
+        parsed_config: AbstractFileBasedSpec,
+    ) -> AbstractFileBasedStream:
+        """
+        Creates different streams depending on the type of the transfer mode selected
+        """
+        if use_permissions_transfer(parsed_config):
+            return self._make_permissions_stream(stream_config, cursor)
+        # we should have a stream for File transfer mode to decouple from DefaultFileBasedStream
+        else:
+            return self._make_default_stream(stream_config, cursor, parsed_config)
+
+    def _make_identities_stream(
+        self,
+    ) -> Stream:
+        return FileIdentitiesStream(
+            catalog_schema=self.stream_schemas.get(FileIdentitiesStream.IDENTITIES_STREAM_NAME),
+            stream_reader=self.stream_reader,
+            discovery_policy=self.discovery_policy,
+            errors_collector=self.errors_collector,
         )
 
     def _get_stream_from_catalog(
@@ -378,33 +441,3 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             "`input_schema` and `schemaless` options cannot both be set",
             model=FileBasedStreamConfig,
         )
-
-    @staticmethod
-    def _use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
-        use_file_transfer = (
-            hasattr(parsed_config.delivery_method, "delivery_type")
-            and parsed_config.delivery_method.delivery_type == "use_file_transfer"
-        )
-        return use_file_transfer
-
-    @staticmethod
-    def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
-        """
-        Determines whether to preserve directory structure during file transfer.
-
-        When enabled, files maintain their subdirectory paths in the destination.
-        When disabled, files are flattened to the root of the destination.
-
-        Args:
-            parsed_config: The parsed configuration containing delivery method settings
-
-        Returns:
-            True if directory structure should be preserved (default), False otherwise
-        """
-        if (
-            FileBasedSource._use_file_transfer(parsed_config)
-            and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
-            and parsed_config.delivery_method.preserve_directory_structure is not None
-        ):
-            return parsed_config.delivery_method.preserve_directory_structure
-        return True
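
Note: check_streams probes the identities stream by pulling a single item from load_identity_groups(), failing fast when the credentials cannot enumerate identities. A sketch of the same probe in isolation (a slightly safer variant: the diff's bare next(iter(...)) would raise StopIteration on an empty iterator instead of appending the error):

    from typing import Iterable, List, Optional

    def first_identity(identities: Iterable[dict]) -> Optional[dict]:
        # next(..., None) instead of bare next(...): empty iterators yield None.
        return next(iter(identities), None)

    def check_identities(identities: Iterable[dict]) -> List[str]:
        errors: List[str] = []
        if not first_identity(identities):
            errors.append(
                "Unable to get identities for current configuration, please check your credentials"
            )
        return errors

    print(check_identities(iter([])))        # credentials problem reported
    print(check_identities([{"id": "u1"}]))  # []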