airbyte-cdk 0.54.0__py3-none-any.whl → 0.55.0__py3-none-any.whl
- airbyte_cdk/sources/concurrent_source/__init__.py +3 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +190 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +161 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +63 -0
- airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +17 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +97 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +4 -4
- airbyte_cdk/sources/streams/concurrent/adapters.py +34 -12
- airbyte_cdk/sources/streams/concurrent/default_stream.py +79 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +7 -7
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +23 -0
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +4 -3
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +2 -3
- airbyte_cdk/sources/utils/slice_logger.py +5 -0
- {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/RECORD +35 -23
- unit_tests/sources/concurrent_source/__init__.py +3 -0
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +105 -0
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +14 -7
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +2 -3
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +44 -55
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +24 -15
- unit_tests/sources/streams/concurrent/test_adapters.py +52 -32
- unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py +6 -5
- unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +604 -0
- unit_tests/sources/streams/concurrent/test_cursor.py +1 -1
- unit_tests/sources/streams/concurrent/{test_thread_based_concurrent_stream.py → test_default_stream.py} +7 -144
- unit_tests/sources/streams/concurrent/test_partition_reader.py +2 -2
- unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +98 -0
- unit_tests/sources/streams/test_stream_read.py +1 -2
- unit_tests/sources/test_concurrent_source.py +105 -0
- unit_tests/sources/test_source_read.py +461 -0
- airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py +0 -221
- {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/top_level.txt +0 -0
airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py

@@ -0,0 +1,190 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+import logging
+from typing import Dict, Iterable, List, Optional, Set
+
+from airbyte_cdk.models import AirbyteMessage, AirbyteStreamStatus
+from airbyte_cdk.models import Type as MessageType
+from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel
+from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
+from airbyte_cdk.sources.message import MessageRepository
+from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
+from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer
+from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
+from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
+from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
+from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel
+from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
+from airbyte_cdk.sources.utils.slice_logger import SliceLogger
+from airbyte_cdk.utils.stream_status_utils import as_airbyte_message as stream_status_as_airbyte_message
+
+
+class ConcurrentReadProcessor:
+    def __init__(
+        self,
+        stream_instances_to_read_from: List[AbstractStream],
+        partition_enqueuer: PartitionEnqueuer,
+        thread_pool_manager: ThreadPoolManager,
+        logger: logging.Logger,
+        slice_logger: SliceLogger,
+        message_repository: MessageRepository,
+        partition_reader: PartitionReader,
+    ):
+        """
+        This class is responsible for handling items from a concurrent stream read process.
+        :param stream_instances_to_read_from: List of streams to read from
+        :param partition_enqueuer: PartitionEnqueuer instance
+        :param thread_pool_manager: ThreadPoolManager instance
+        :param logger: Logger instance
+        :param slice_logger: SliceLogger instance
+        :param message_repository: MessageRepository instance
+        :param partition_reader: PartitionReader instance
+        """
+        self._stream_name_to_instance = {s.name: s for s in stream_instances_to_read_from}
+        self._record_counter = {}
+        self._streams_to_partitions: Dict[str, Set[Partition]] = {}
+        for stream in stream_instances_to_read_from:
+            self._streams_to_partitions[stream.name] = set()
+            self._record_counter[stream.name] = 0
+        self._thread_pool_manager = thread_pool_manager
+        self._partition_enqueuer = partition_enqueuer
+        self._stream_instances_to_start_partition_generation = stream_instances_to_read_from
+        self._streams_currently_generating_partitions: List[str] = []
+        self._logger = logger
+        self._slice_logger = slice_logger
+        self._message_repository = message_repository
+        self._partition_reader = partition_reader
+
+    def on_partition_generation_completed(self, sentinel: PartitionGenerationCompletedSentinel) -> Iterable[AirbyteMessage]:
+        """
+        This method is called when a partition generation is completed.
+        1. Remove the stream from the list of streams currently generating partitions
+        2. If the stream is done, mark it as such and return a stream status message
+        3. If there are more streams to read from, start the next partition generator
+        """
+        stream_name = sentinel.stream.name
+        self._streams_currently_generating_partitions.remove(sentinel.stream.name)
+        ret = []
+        # It is possible for the stream to already be done if no partitions were generated
+        if self._is_stream_done(stream_name):
+            ret.append(self._on_stream_is_done(stream_name))
+        if self._stream_instances_to_start_partition_generation:
+            ret.append(self.start_next_partition_generator())
+        return ret
+
+    def on_partition(self, partition: Partition) -> None:
+        """
+        This method is called when a partition is generated.
+        1. Add the partition to the set of partitions for the stream
+        2. Log the slice if necessary
+        3. Submit the partition to the thread pool manager
+        """
+        stream_name = partition.stream_name()
+        self._streams_to_partitions[stream_name].add(partition)
+        if self._slice_logger.should_log_slice_message(self._logger):
+            self._message_repository.emit_message(self._slice_logger.create_slice_log_message(partition.to_slice()))
+        self._thread_pool_manager.submit(self._partition_reader.process_partition, partition)
+
+    def on_partition_complete_sentinel(self, sentinel: PartitionCompleteSentinel) -> Iterable[AirbyteMessage]:
+        """
+        This method is called when a partition is completed.
+        1. Close the partition
+        2. If the stream is done, mark it as such and return a stream status message
+        3. Emit messages that were added to the message repository
+        """
+        partition = sentinel.partition
+        partition.close()
+        if self._is_stream_done(partition.stream_name()):
+            yield self._on_stream_is_done(partition.stream_name())
+        yield from self._message_repository.consume_queue()
+
+    def on_record(self, record: Record) -> Iterable[AirbyteMessage]:
+        """
+        This method is called when a record is read from a partition.
+        1. Convert the record to an AirbyteMessage
+        2. If this is the first record for the stream, mark the stream as RUNNING
+        3. Increment the record counter for the stream
+        4. Emit the message
+        5. Emit messages that were added to the message repository
+        """
+        # Do not pass a transformer or a schema
+        # AbstractStreams are expected to return data as they are expected.
+        # Any transformation on the data should be done before reaching this point
+        message = stream_data_to_airbyte_message(record.stream_name, record.data)
+        stream = self._stream_name_to_instance[record.stream_name]
+
+        if self._record_counter[stream.name] == 0:
+            self._logger.info(f"Marking stream {stream.name} as RUNNING")
+            yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING)
+
+        if message.type == MessageType.RECORD:
+            self._record_counter[stream.name] += 1
+        yield message
+        yield from self._message_repository.consume_queue()
+
+    def on_exception(self, exception: Exception) -> Iterable[AirbyteMessage]:
+        """
+        This method is called when an exception is raised.
+        1. Stop all running streams
+        2. Raise the exception
+        """
+        yield from self._stop_streams()
+        raise exception
+
+    def start_next_partition_generator(self) -> Optional[AirbyteMessage]:
+        """
+        Start the next partition generator.
+        1. Pop the next stream to read from
+        2. Submit the partition generator to the thread pool manager
+        3. Add the stream to the list of streams currently generating partitions
+        4. Return a stream status message
+        """
+        if self._stream_instances_to_start_partition_generation:
+            stream = self._stream_instances_to_start_partition_generation.pop(0)
+            self._thread_pool_manager.submit(self._partition_enqueuer.generate_partitions, stream)
+            self._streams_currently_generating_partitions.append(stream.name)
+            self._logger.info(f"Marking stream {stream.name} as STARTED")
+            self._logger.info(f"Syncing stream: {stream.name} ")
+            return stream_status_as_airbyte_message(
+                stream.as_airbyte_stream(),
+                AirbyteStreamStatus.STARTED,
+            )
+        else:
+            return None
+
+    def is_done(self) -> bool:
+        """
+        This method is called to check if the sync is done.
+        The sync is done when:
+        1. There are no more streams generating partitions
+        2. There are no more streams to read from
+        3. All partitions for all streams are closed
+        """
+        return (
+            not self._streams_currently_generating_partitions
+            and not self._stream_instances_to_start_partition_generation
+            and all([all(p.is_closed() for p in partitions) for partitions in self._streams_to_partitions.values()])
+        )
+
+    def _is_stream_done(self, stream_name: str) -> bool:
+        return (
+            all([p.is_closed() for p in self._streams_to_partitions[stream_name]])
+            and stream_name not in self._streams_currently_generating_partitions
+        )
+
+    def _on_stream_is_done(self, stream_name: str) -> AirbyteMessage:
+        self._logger.info(f"Read {self._record_counter[stream_name]} records from {stream_name} stream")
+        self._logger.info(f"Marking stream {stream_name} as STOPPED")
+        stream = self._stream_name_to_instance[stream_name]
+        self._logger.info(f"Finished syncing {stream.name}")
+        return stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.COMPLETE)
+
+    def _stop_streams(self) -> Iterable[AirbyteMessage]:
+        self._thread_pool_manager.shutdown()
+        for stream_name, partitions in self._streams_to_partitions.items():
+            stream = self._stream_name_to_instance[stream_name]
+            if not all([p.is_closed() for p in partitions]):
+                self._logger.info(f"Marking stream {stream.name} as STOPPED")
+                self._logger.info(f"Finished syncing {stream.name}")
+                yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.INCOMPLETE)
airbyte_cdk/sources/concurrent_source/concurrent_source.py

@@ -0,0 +1,161 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+import concurrent
+import logging
+from queue import Queue
+from typing import Iterable, Iterator, List
+
+from airbyte_cdk.models import AirbyteMessage
+from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor
+from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel
+from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
+from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
+from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
+from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer
+from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
+from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
+from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
+from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem
+from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
+
+
+class ConcurrentSource:
+    """
+    A Source that reads data from multiple AbstractStreams concurrently.
+    It does so by submitting partition generation, and partition read tasks to a thread pool.
+    The tasks asynchronously add their output to a shared queue.
+    The read is done when all partitions for all streams were generated and read.
+    """
+
+    DEFAULT_TIMEOUT_SECONDS = 900
+
+    @staticmethod
+    def create(
+        num_workers: int,
+        initial_number_of_partitions_to_generate: int,
+        logger: logging.Logger,
+        slice_logger: SliceLogger,
+        message_repository: MessageRepository,
+        timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
+    ) -> "ConcurrentSource":
+        threadpool = ThreadPoolManager(
+            concurrent.futures.ThreadPoolExecutor(max_workers=num_workers, thread_name_prefix="workerpool"), logger, num_workers
+        )
+        return ConcurrentSource(
+            threadpool, logger, slice_logger, message_repository, initial_number_of_partitions_to_generate, timeout_seconds
+        )
+
+    def __init__(
+        self,
+        threadpool: ThreadPoolManager,
+        logger: logging.Logger,
+        slice_logger: SliceLogger = DebugSliceLogger(),
+        message_repository: MessageRepository = InMemoryMessageRepository(),
+        initial_number_partitions_to_generate: int = 1,
+        timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
+    ) -> None:
+        """
+        :param threadpool: The threadpool to submit tasks to
+        :param logger: The logger to log to
+        :param slice_logger: The slice logger used to create messages on new slices
+        :param message_repository: The repository to emit messages to
+        :param initial_number_partitions_to_generate: The initial number of concurrent partition generation tasks. Limiting this number ensures will limit the latency of the first records emitted. While the latency is not critical, emitting the records early allows the platform and the destination to process them as early as possible.
+        :param timeout_seconds: The maximum number of seconds to wait for a record to be read from the queue. If no record is read within this time, the source will stop reading and return.
+        """
+        self._threadpool = threadpool
+        self._logger = logger
+        self._slice_logger = slice_logger
+        self._message_repository = message_repository
+        self._initial_number_partitions_to_generate = initial_number_partitions_to_generate
+        self._timeout_seconds = timeout_seconds
+
+    def read(
+        self,
+        streams: List[AbstractStream],
+    ) -> Iterator[AirbyteMessage]:
+        self._logger.info("Starting syncing")
+        stream_instances_to_read_from = self._get_streams_to_read_from(streams)
+
+        # Return early if there are no streams to read from
+        if not stream_instances_to_read_from:
+            return
+
+        queue: Queue[QueueItem] = Queue()
+        concurrent_stream_processor = ConcurrentReadProcessor(
+            stream_instances_to_read_from,
+            PartitionEnqueuer(queue),
+            self._threadpool,
+            self._logger,
+            self._slice_logger,
+            self._message_repository,
+            PartitionReader(queue),
+        )
+
+        # Enqueue initial partition generation tasks
+        yield from self._submit_initial_partition_generators(concurrent_stream_processor)
+
+        # Read from the queue until all partitions were generated and read
+        yield from self._consume_from_queue(
+            queue,
+            concurrent_stream_processor,
+        )
+        self._threadpool.check_for_errors_and_shutdown()
+        self._logger.info("Finished syncing")
+
+    def _submit_initial_partition_generators(self, concurrent_stream_processor: ConcurrentReadProcessor) -> Iterable[AirbyteMessage]:
+        for _ in range(self._initial_number_partitions_to_generate):
+            status_message = concurrent_stream_processor.start_next_partition_generator()
+            if status_message:
+                yield status_message
+
+    def _consume_from_queue(
+        self,
+        queue: Queue[QueueItem],
+        concurrent_stream_processor: ConcurrentReadProcessor,
+    ) -> Iterable[AirbyteMessage]:
+        while airbyte_message_or_record_or_exception := queue.get(block=True, timeout=self._timeout_seconds):
+            yield from self._handle_item(
+                airbyte_message_or_record_or_exception,
+                concurrent_stream_processor,
+            )
+            if concurrent_stream_processor.is_done() and queue.empty():
+                # all partitions were generated and processed. we're done here
+                break
+
+    def _handle_item(
+        self,
+        queue_item: QueueItem,
+        concurrent_stream_processor: ConcurrentReadProcessor,
+    ) -> Iterable[AirbyteMessage]:
+        # handle queue item and call the appropriate handler depending on the type of the queue item
+        if isinstance(queue_item, Exception):
+            yield from concurrent_stream_processor.on_exception(queue_item)
+
+        elif isinstance(queue_item, PartitionGenerationCompletedSentinel):
+            yield from concurrent_stream_processor.on_partition_generation_completed(queue_item)
+
+        elif isinstance(queue_item, Partition):
+            concurrent_stream_processor.on_partition(queue_item)
+        elif isinstance(queue_item, PartitionCompleteSentinel):
+            yield from concurrent_stream_processor.on_partition_complete_sentinel(queue_item)
+        elif isinstance(queue_item, Record):
+            yield from concurrent_stream_processor.on_record(queue_item)
+        else:
+            raise ValueError(f"Unknown queue item type: {type(queue_item)}")
+
+    def _get_streams_to_read_from(self, streams: List[AbstractStream]) -> List[AbstractStream]:
+        """
+        Iterate over the configured streams and return a list of streams to read from.
+        If a stream is not configured, it will be skipped.
+        If a stream is configured but does not exist in the source and self.raise_exception_on_missing_stream is True, an exception will be raised
+        If a stream is not available, it will be skipped
+        """
+        stream_instances_to_read_from = []
+        for stream in streams:
+            stream_availability = stream.check_availability()
+            if not stream_availability.is_available():
+                self._logger.warning(f"Skipped syncing stream '{stream.name}' because it was unavailable. {stream_availability.message()}")
+                continue
+            stream_instances_to_read_from.append(stream)
+        return stream_instances_to_read_from
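The `create()` factory and `read()` are the public entry points of the new concurrent source. The following is a minimal, hypothetical usage sketch based on the signatures shown above; `my_streams` is a placeholder list of `AbstractStream` implementations and is not part of this release.

```python
import logging

from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
from airbyte_cdk.sources.message import InMemoryMessageRepository
from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger

logger = logging.getLogger("airbyte")

# Build a source with 4 worker threads and 2 concurrent partition-generation tasks.
source = ConcurrentSource.create(
    num_workers=4,
    initial_number_of_partitions_to_generate=2,
    logger=logger,
    slice_logger=DebugSliceLogger(),
    message_repository=InMemoryMessageRepository(),
)

# my_streams: placeholder list of AbstractStream implementations to read from.
for message in source.read(my_streams):
    print(message)
```

As `_get_streams_to_read_from` above shows, streams whose `check_availability()` reports them unavailable are skipped with a warning rather than failing the sync.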
airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py

@@ -0,0 +1,63 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+import logging
+from abc import ABC
+from typing import Any, Iterator, List, Mapping, MutableMapping, Optional, Union
+
+from airbyte_cdk.models import AirbyteMessage, AirbyteStateMessage, ConfiguredAirbyteCatalog
+from airbyte_cdk.sources import AbstractSource
+from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
+from airbyte_cdk.sources.streams import Stream
+from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
+from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
+
+
+class ConcurrentSourceAdapter(AbstractSource, ABC):
+    def __init__(self, concurrent_source: ConcurrentSource, **kwargs: Any) -> None:
+        """
+        ConcurrentSourceAdapter is a Source that wraps a concurrent source and exposes it as a regular source.
+
+        The source's streams are still defined through the streams() method.
+        Streams wrapped in a StreamFacade will be processed concurrently.
+        Other streams will be processed sequentially as a later step.
+        """
+        self._concurrent_source = concurrent_source
+        super().__init__(**kwargs)
+
+    def read(
+        self,
+        logger: logging.Logger,
+        config: Mapping[str, Any],
+        catalog: ConfiguredAirbyteCatalog,
+        state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
+    ) -> Iterator[AirbyteMessage]:
+        abstract_streams = self._select_abstract_streams(config, catalog)
+        concurrent_stream_names = {stream.name for stream in abstract_streams}
+        configured_catalog_for_regular_streams = ConfiguredAirbyteCatalog(
+            streams=[stream for stream in catalog.streams if stream.stream.name not in concurrent_stream_names]
+        )
+        if abstract_streams:
+            yield from self._concurrent_source.read(abstract_streams)
+        if configured_catalog_for_regular_streams.streams:
+            yield from super().read(logger, config, configured_catalog_for_regular_streams, state)
+
+    def _select_abstract_streams(self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog) -> List[AbstractStream]:
+        """
+        Selects streams that can be processed concurrently and returns their abstract representations.
+        """
+        all_streams = self.streams(config)
+        stream_name_to_instance: Mapping[str, Stream] = {s.name: s for s in all_streams}
+        abstract_streams: List[AbstractStream] = []
+        for configured_stream in configured_catalog.streams:
+            stream_instance = stream_name_to_instance.get(configured_stream.stream.name)
+            if not stream_instance:
+                if not self.raise_exception_on_missing_stream:
+                    continue
+                raise KeyError(
+                    f"The stream {configured_stream.stream.name} no longer exists in the configuration. "
+                    f"Refresh the schema in replication settings and remove this stream from future sync attempts."
+                )
+            if isinstance(stream_instance, StreamFacade):
+                abstract_streams.append(stream_instance._abstract_stream)
+        return abstract_streams
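A connector adopts this by subclassing the adapter like a regular `AbstractSource`. A hedged sketch of what that could look like follows; the class name and stream construction are illustrative only, and the `StreamFacade` wrapping itself lives in airbyte_cdk/sources/streams/concurrent/adapters.py (not shown in this diff).

```python
import logging
from typing import Any, List, Mapping, Optional, Tuple

from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
from airbyte_cdk.sources.streams import Stream


class MyHybridSource(ConcurrentSourceAdapter):
    """Hypothetical connector: StreamFacade-wrapped streams are read concurrently, the rest sequentially."""

    def __init__(self, concurrent_source: ConcurrentSource, **kwargs: Any) -> None:
        super().__init__(concurrent_source, **kwargs)

    def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
        # Connection checking is connector-specific; this sketch always reports success.
        return True, None

    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
        # Return plain Stream instances and/or streams wrapped in a StreamFacade;
        # only the facade-wrapped ones are handed to the ConcurrentSource by read().
        return []
```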
airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py

@@ -0,0 +1,17 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
+
+
+class PartitionGenerationCompletedSentinel:
+    """
+    A sentinel object indicating all partitions for a stream were produced.
+    Includes a pointer to the stream that was processed.
+    """
+
+    def __init__(self, stream: AbstractStream):
+        """
+        :param stream: The stream that was processed
+        """
+        self.stream = stream
airbyte_cdk/sources/concurrent_source/thread_pool_manager.py

@@ -0,0 +1,97 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+import logging
+import time
+from concurrent.futures import Future, ThreadPoolExecutor
+from typing import Any, Callable, List
+
+
+class ThreadPoolManager:
+    """
+    Wrapper to abstract away the threadpool and the logic to wait for pending tasks to be completed.
+    """
+
+    DEFAULT_SLEEP_TIME = 0.1
+    DEFAULT_MAX_QUEUE_SIZE = 10_000
+
+    def __init__(
+        self,
+        threadpool: ThreadPoolExecutor,
+        logger: logging.Logger,
+        max_concurrent_tasks: int = DEFAULT_MAX_QUEUE_SIZE,
+        sleep_time: float = DEFAULT_SLEEP_TIME,
+    ):
+        """
+        :param threadpool: The threadpool to use
+        :param logger: The logger to use
+        :param max_concurrent_tasks: The maximum number of tasks that can be pending at the same time
+        :param sleep_time: How long to sleep if there are too many pending tasks
+        """
+        self._threadpool = threadpool
+        self._logger = logger
+        self._max_concurrent_tasks = max_concurrent_tasks
+        self._sleep_time = sleep_time
+        self._futures: List[Future[Any]] = []
+
+    def submit(self, function: Callable[..., Any], *args: Any) -> None:
+        # Submit a task to the threadpool, waiting if there are too many pending tasks
+        self._wait_while_too_many_pending_futures(self._futures)
+        self._futures.append(self._threadpool.submit(function, *args))
+
+    def _wait_while_too_many_pending_futures(self, futures: List[Future[Any]]) -> None:
+        # Wait until the number of pending tasks is < self._max_concurrent_tasks
+        while True:
+            self._prune_futures(futures)
+            if len(futures) < self._max_concurrent_tasks:
+                break
+            self._logger.info("Main thread is sleeping because the task queue is full...")
+            time.sleep(self._sleep_time)
+
+    def _prune_futures(self, futures: List[Future[Any]]) -> None:
+        """
+        Take a list in input and remove the futures that are completed. If a future has an exception, it'll raise and kill the stream
+        operation.
+
+        Pruning this list safely relies on the assumptions that only the main thread can modify the list of futures.
+        """
+        if len(futures) < self._max_concurrent_tasks:
+            return
+
+        for index in reversed(range(len(futures))):
+            future = futures[index]
+            optional_exception = future.exception()
+            if optional_exception:
+                exception = RuntimeError(f"Failed reading with error: {optional_exception}")
+                self._stop_and_raise_exception(exception)
+
+            if future.done():
+                futures.pop(index)
+
+    def shutdown(self) -> None:
+        self._threadpool.shutdown(wait=False, cancel_futures=True)
+
+    def is_done(self) -> bool:
+        return all([f.done() for f in self._futures])
+
+    def check_for_errors_and_shutdown(self) -> None:
+        """
+        Check if any of the futures have an exception, and raise it if so. If all futures are done, shutdown the threadpool.
+        If the futures are not done, raise an exception.
+        :return:
+        """
+        exceptions_from_futures = [f for f in [future.exception() for future in self._futures] if f is not None]
+        if exceptions_from_futures:
+            exception = RuntimeError(f"Failed reading with errors: {exceptions_from_futures}")
+            self._stop_and_raise_exception(exception)
+        else:
+            futures_not_done = [f for f in self._futures if not f.done()]
+            if futures_not_done:
+                exception = RuntimeError(f"Failed reading with futures not done: {futures_not_done}")
+                self._stop_and_raise_exception(exception)
+            else:
+                self.shutdown()
+
+    def _stop_and_raise_exception(self, exception: BaseException) -> None:
+        self.shutdown()
+        raise exception
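Although the manager is normally driven by `ConcurrentSource.read`, it can be exercised on its own. A small, hypothetical sketch using only the methods shown above; the `read_partition` task is a stand-in for the real partition-generation and partition-read work.

```python
import logging
import time
from concurrent.futures import ThreadPoolExecutor

from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager

logger = logging.getLogger("airbyte")
pool = ThreadPoolManager(ThreadPoolExecutor(max_workers=2, thread_name_prefix="workerpool"), logger)


def read_partition(partition_id: int) -> None:
    # Placeholder task; real callers submit PartitionEnqueuer/PartitionReader work.
    logger.info("reading partition %s", partition_id)


for i in range(5):
    pool.submit(read_partition, i)

# Wait for the submitted tasks, then surface any exception and shut the pool down.
while not pool.is_done():
    time.sleep(0.1)
pool.check_for_errors_and_shutdown()
```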
airbyte_cdk/sources/streams/concurrent/abstract_stream.py

@@ -7,7 +7,7 @@ from typing import Any, Iterable, Mapping, Optional
 
 from airbyte_cdk.models import AirbyteStream
 from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability
-from airbyte_cdk.sources.streams.concurrent.partitions.
+from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from deprecated.classic import deprecated
 
 
@@ -37,10 +37,10 @@ class AbstractStream(ABC):
     """
 
     @abstractmethod
-    def
+    def generate_partitions(self) -> Iterable[Partition]:
         """
-
-        :return:
+        Generates the partitions that will be read by this stream.
+        :return: An iterable of partitions.
         """
 
     @property