airbyte-cdk 0.58.8__py3-none-any.whl → 0.59.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +20 -21
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +4 -3
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +15 -18
- airbyte_cdk/sources/concurrent_source/throttler.py +25 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +29 -8
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +3 -5
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +3 -4
- airbyte_cdk/sources/streams/concurrent/partitions/throttled_queue.py +41 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +6 -12
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +36 -30
- {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/RECORD +23 -19
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +2 -2
- unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py +4 -10
- unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +82 -12
- unit_tests/sources/streams/concurrent/test_cursor.py +20 -3
- unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +166 -268
- unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +2 -15
- unit_tests/sources/streams/concurrent/test_throttled_queue.py +65 -0
- unit_tests/sources/streams/concurrent/test_throttler.py +13 -0
- {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/top_level.txt +0 -0
airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py

@@ -43,9 +43,9 @@ class ConcurrentReadProcessor:
         """
         self._stream_name_to_instance = {s.name: s for s in stream_instances_to_read_from}
         self._record_counter = {}
-        self._streams_to_partitions: Dict[str, Set[Partition]] = {}
+        self._streams_to_running_partitions: Dict[str, Set[Partition]] = {}
         for stream in stream_instances_to_read_from:
-            self._streams_to_partitions[stream.name] = set()
+            self._streams_to_running_partitions[stream.name] = set()
             self._record_counter[stream.name] = 0
         self._thread_pool_manager = thread_pool_manager
         self._partition_enqueuer = partition_enqueuer
@@ -55,6 +55,7 @@ class ConcurrentReadProcessor:
         self._slice_logger = slice_logger
         self._message_repository = message_repository
         self._partition_reader = partition_reader
+        self._streams_done: Set[str] = set()

     def on_partition_generation_completed(self, sentinel: PartitionGenerationCompletedSentinel) -> Iterable[AirbyteMessage]:
         """
@@ -67,7 +68,8 @@ class ConcurrentReadProcessor:
         self._streams_currently_generating_partitions.remove(sentinel.stream.name)
         ret = []
         # It is possible for the stream to already be done if no partitions were generated
-        if self._is_stream_done(stream_name):
+        # If the partition generation process was completed and there are no partitions left to process, the stream is done
+        if self._is_stream_done(stream_name) or len(self._streams_to_running_partitions[stream_name]) == 0:
             ret.append(self._on_stream_is_done(stream_name))
         if self._stream_instances_to_start_partition_generation:
             ret.append(self.start_next_partition_generator())
@@ -81,7 +83,7 @@ class ConcurrentReadProcessor:
         3. Submit the partition to the thread pool manager
         """
         stream_name = partition.stream_name()
-        self._streams_to_partitions[stream_name].add(partition)
+        self._streams_to_running_partitions[stream_name].add(partition)
         if self._slice_logger.should_log_slice_message(self._logger):
             self._message_repository.emit_message(self._slice_logger.create_slice_log_message(partition.to_slice()))
         self._thread_pool_manager.submit(self._partition_reader.process_partition, partition)
@@ -95,8 +97,12 @@ class ConcurrentReadProcessor:
         """
         partition = sentinel.partition
         partition.close()
-        if self._is_stream_done(partition.stream_name()):
-            yield self._on_stream_is_done(partition.stream_name())
+        partitions_running = self._streams_to_running_partitions[partition.stream_name()]
+        if partition in partitions_running:
+            partitions_running.remove(partition)
+            # If all partitions were generated and this was the last one, the stream is done
+            if partition.stream_name() not in self._streams_currently_generating_partitions and len(partitions_running) == 0:
+                yield self._on_stream_is_done(partition.stream_name())
         yield from self._message_repository.consume_queue()

     def on_record(self, record: Record) -> Iterable[AirbyteMessage]:
@@ -114,11 +120,10 @@ class ConcurrentReadProcessor:
         message = stream_data_to_airbyte_message(record.stream_name, record.data)
         stream = self._stream_name_to_instance[record.stream_name]

-        if self._record_counter[stream.name] == 0:
-            self._logger.info(f"Marking stream {stream.name} as RUNNING")
-            yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING)
-
         if message.type == MessageType.RECORD:
+            if self._record_counter[stream.name] == 0:
+                self._logger.info(f"Marking stream {stream.name} as RUNNING")
+                yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING)
             self._record_counter[stream.name] += 1
         yield message
         yield from self._message_repository.consume_queue()
@@ -161,30 +166,24 @@ class ConcurrentReadProcessor:
         2. There are no more streams to read from
         3. All partitions for all streams are closed
         """
-        return (
-            not self._streams_currently_generating_partitions
-            and not self._stream_instances_to_start_partition_generation
-            and all([all(p.is_closed() for p in partitions) for partitions in self._streams_to_partitions.values()])
-        )
+        return all([self._is_stream_done(stream_name) for stream_name in self._stream_name_to_instance.keys()])

     def _is_stream_done(self, stream_name: str) -> bool:
-        return (
-            all([p.is_closed() for p in self._streams_to_partitions[stream_name]])
-            and stream_name not in self._streams_currently_generating_partitions
-        )
+        return stream_name in self._streams_done

     def _on_stream_is_done(self, stream_name: str) -> AirbyteMessage:
         self._logger.info(f"Read {self._record_counter[stream_name]} records from {stream_name} stream")
         self._logger.info(f"Marking stream {stream_name} as STOPPED")
         stream = self._stream_name_to_instance[stream_name]
         self._logger.info(f"Finished syncing {stream.name}")
+        self._streams_done.add(stream_name)
         return stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.COMPLETE)

     def _stop_streams(self) -> Iterable[AirbyteMessage]:
         self._thread_pool_manager.shutdown()
-        for stream_name, partitions in self._streams_to_partitions.items():
+        for stream_name in self._streams_to_running_partitions.keys():
             stream = self._stream_name_to_instance[stream_name]
-            if not all([p.is_closed() for p in partitions]):
+            if not self._is_stream_done(stream_name):
                 self._logger.info(f"Marking stream {stream.name} as STOPPED")
                 self._logger.info(f"Finished syncing {stream.name}")
                 yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.INCOMPLETE)
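Taken together, the ConcurrentReadProcessor changes replace "recompute doneness from partition state" with explicit bookkeeping: a stream is marked done either when its last running partition closes after generation has finished, or when generation finishes with no partitions left. The sketch below reduces that bookkeeping to a hypothetical `StreamTracker` class (simplified names; not the actual CDK class) to show the two completion paths:

```python
from typing import Dict, List, Set


class StreamTracker:
    """Hypothetical reduction of ConcurrentReadProcessor's completion bookkeeping."""

    def __init__(self, stream_names: List[str]) -> None:
        self._running_partitions: Dict[str, Set[int]] = {name: set() for name in stream_names}
        self._generating: Set[str] = set(stream_names)
        self._done: Set[str] = set()

    def on_partition(self, stream: str, partition_id: int) -> None:
        self._running_partitions[stream].add(partition_id)

    def on_partition_closed(self, stream: str, partition_id: int) -> None:
        self._running_partitions[stream].discard(partition_id)
        # Completion path 1: generation already finished and this was the last partition
        if stream not in self._generating and not self._running_partitions[stream]:
            self._done.add(stream)

    def on_generation_completed(self, stream: str) -> None:
        self._generating.discard(stream)
        # Completion path 2: generation finished and no partitions are (or ever were) running
        if not self._running_partitions[stream]:
            self._done.add(stream)

    def is_done(self, stream: str) -> bool:
        return stream in self._done


tracker = StreamTracker(["users"])
tracker.on_generation_completed("users")
assert tracker.is_done("users")  # a stream with zero partitions completes immediately
```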
airbyte_cdk/sources/concurrent_source/concurrent_source.py

@@ -16,6 +16,7 @@ from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer
 from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
+from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue
 from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem
 from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger

@@ -82,7 +83,7 @@ class ConcurrentSource:
         if not stream_instances_to_read_from:
             return

-        queue: Queue[QueueItem] = Queue()
+        queue: ThrottledQueue = ThrottledQueue(Queue(), self._threadpool.get_throttler(), self._timeout_seconds)
         concurrent_stream_processor = ConcurrentReadProcessor(
             stream_instances_to_read_from,
             PartitionEnqueuer(queue),
@@ -112,10 +113,10 @@ class ConcurrentSource:

     def _consume_from_queue(
         self,
-        queue: Queue[QueueItem],
+        queue: ThrottledQueue,
         concurrent_stream_processor: ConcurrentReadProcessor,
     ) -> Iterable[AirbyteMessage]:
-        while airbyte_message_or_record_or_exception := queue.get(block=True, timeout=self._timeout_seconds):
+        while airbyte_message_or_record_or_exception := queue.get():
            yield from self._handle_item(
                airbyte_message_or_record_or_exception,
                concurrent_stream_processor,
airbyte_cdk/sources/concurrent_source/thread_pool_manager.py

@@ -2,10 +2,11 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 import logging
-import time
 from concurrent.futures import Future, ThreadPoolExecutor
 from typing import Any, Callable, List

+from airbyte_cdk.sources.concurrent_source.throttler import Throttler
+

 class ThreadPoolManager:
     """
@@ -19,8 +20,8 @@ class ThreadPoolManager:
         self,
         threadpool: ThreadPoolExecutor,
         logger: logging.Logger,
-        max_concurrent_tasks: int = DEFAULT_MAX_QUEUE_SIZE,
         sleep_time: float = DEFAULT_SLEEP_TIME,
+        max_concurrent_tasks: int = DEFAULT_MAX_QUEUE_SIZE,
     ):
         """
         :param threadpool: The threadpool to use
@@ -31,23 +32,17 @@ class ThreadPoolManager:
         self._threadpool = threadpool
         self._logger = logger
         self._max_concurrent_tasks = max_concurrent_tasks
-        self._sleep_time = sleep_time
         self._futures: List[Future[Any]] = []
+        self._throttler = Throttler(self._futures, sleep_time, max_concurrent_tasks)
+
+    def get_throttler(self) -> Throttler:
+        return self._throttler

     def submit(self, function: Callable[..., Any], *args: Any) -> None:
-        # Submit a task to the threadpool, waiting if there are too many pending tasks
-        self._wait_while_too_many_pending_futures(self._futures)
+        # Submit a task to the threadpool, removing completed tasks if there are too many tasks in self._futures.
+        self._prune_futures(self._futures)
         self._futures.append(self._threadpool.submit(function, *args))

-    def _wait_while_too_many_pending_futures(self, futures: List[Future[Any]]) -> None:
-        # Wait until the number of pending tasks is < self._max_concurrent_tasks
-        while True:
-            self._prune_futures(futures)
-            if len(futures) < self._max_concurrent_tasks:
-                break
-            self._logger.info("Main thread is sleeping because the task queue is full...")
-            time.sleep(self._sleep_time)
-
     def _prune_futures(self, futures: List[Future[Any]]) -> None:
         """
         Take a list in input and remove the futures that are completed. If a future has an exception, it'll raise and kill the stream
@@ -60,12 +55,14 @@ class ThreadPoolManager:

         for index in reversed(range(len(futures))):
             future = futures[index]
-            optional_exception = future.exception()
-            if optional_exception:
-                exception = RuntimeError(f"Failed reading with error: {optional_exception}")
-                self._stop_and_raise_exception(exception)

             if future.done():
+                # Only call future.exception() if the future is known to be done because it will block until the future is done.
+                # See https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.Future.exception
+                optional_exception = future.exception()
+                if optional_exception:
+                    exception = RuntimeError(f"Failed reading with error: {optional_exception}")
+                    self._stop_and_raise_exception(exception)
                 futures.pop(index)

     def shutdown(self) -> None:
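The reason for moving the exception check inside the `future.done()` branch is that `Future.exception()` blocks until the future resolves, whereas `Future.done()` returns immediately; checking exceptions on still-pending futures could stall the pruning loop. A small standard-library demonstration of the non-blocking pattern:

```python
import time
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Any, List


def flaky(i: int) -> int:
    if i == 2:
        raise ValueError("boom")
    time.sleep(0.01)
    return i


with ThreadPoolExecutor(max_workers=2) as pool:
    futures: List[Future[Any]] = [pool.submit(flaky, i) for i in range(4)]
    time.sleep(0.1)  # give the workers a chance to finish
    for index in reversed(range(len(futures))):
        future = futures[index]
        if future.done():  # non-blocking check
            # exception() is now guaranteed not to block; it returns None on success
            if future.exception() is not None:
                print(f"task {index} failed: {future.exception()}")
            futures.pop(index)
```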
airbyte_cdk/sources/concurrent_source/throttler.py (new file)

@@ -0,0 +1,25 @@
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+
+import time
+from concurrent.futures import Future
+from typing import Any, List
+
+
+class Throttler:
+    """
+    A throttler that waits until the number of concurrent tasks is below a certain threshold.
+    """
+
+    def __init__(self, futures_list: List[Future[Any]], sleep_time: float, max_concurrent_tasks: int):
+        """
+        :param futures_list: The list of futures to monitor
+        :param sleep_time: How long to sleep if there are too many pending tasks
+        :param max_concurrent_tasks: The maximum number of tasks that can be pending at the same time
+        """
+        self._futures_list = futures_list
+        self._sleep_time = sleep_time
+        self._max_concurrent_tasks = max_concurrent_tasks
+
+    def wait_and_acquire(self) -> None:
+        while len(self._futures_list) >= self._max_concurrent_tasks:
+            time.sleep(self._sleep_time)
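Note that `wait_and_acquire` only sleeps; it relies on another thread shrinking the shared futures list (in this release, `ThreadPoolManager._prune_futures`, which the main thread runs on every submit). A hypothetical standalone usage sketch, with a background pruner standing in for that behavior (assumes airbyte-cdk>=0.59.0 is installed):

```python
import threading
import time
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Any, List

from airbyte_cdk.sources.concurrent_source.throttler import Throttler

futures: List[Future[Any]] = []
throttler = Throttler(futures, sleep_time=0.05, max_concurrent_tasks=2)


def prune_forever() -> None:
    # Stand-in for ThreadPoolManager._prune_futures; mutates the list in place
    # so the Throttler sees the same object shrink.
    while True:
        futures[:] = [f for f in futures if not f.done()]
        time.sleep(0.05)


threading.Thread(target=prune_forever, daemon=True).start()

with ThreadPoolExecutor(max_workers=2) as pool:
    for i in range(6):
        throttler.wait_and_acquire()  # sleeps while two or more tasks are pending
        futures.append(pool.submit(time.sleep, 0.1))
print("submissions were throttled to at most 2 pending tasks")
```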
airbyte_cdk/sources/streams/concurrent/cursor.py

@@ -3,7 +3,8 @@
 #
 import functools
 from abc import ABC, abstractmethod
-from typing import Any, List, Mapping, Optional, Protocol, Tuple
+from datetime import datetime
+from typing import Any, List, Mapping, MutableMapping, Optional, Protocol, Tuple

 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.message import MessageRepository
@@ -36,6 +37,11 @@ class CursorField:


 class Cursor(ABC):
+    @property
+    @abstractmethod
+    def state(self) -> MutableMapping[str, Any]:
+        ...
+
     @abstractmethod
     def observe(self, record: Record) -> None:
         """
@@ -52,6 +58,10 @@ class Cursor(ABC):


 class NoopCursor(Cursor):
+    @property
+    def state(self) -> MutableMapping[str, Any]:
+        return {}
+
     def observe(self, record: Record) -> None:
         pass

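Since `state` is now abstract on `Cursor`, any custom cursor must expose its state as a mapping; `NoopCursor` satisfies the contract with an empty dict. A hypothetical minimal implementation, with `Record` stubbed so the sketch stands alone (the real interface also includes `close_partition`):

```python
from typing import Any, Mapping, MutableMapping


class Record:  # stub of airbyte_cdk's Record type, for a self-contained example
    def __init__(self, data: Mapping[str, Any], stream_name: str) -> None:
        self.data = data
        self.stream_name = stream_name


class MaxUpdatedAtCursor:
    """Hypothetical cursor tracking the largest "updated_at" value observed."""

    def __init__(self) -> None:
        self._state: MutableMapping[str, Any] = {}

    @property
    def state(self) -> MutableMapping[str, Any]:
        return self._state

    def observe(self, record: Record) -> None:
        seen = record.data.get("updated_at")
        if seen is not None and seen > self._state.get("updated_at", ""):
            self._state["updated_at"] = seen


cursor = MaxUpdatedAtCursor()
cursor.observe(Record({"updated_at": "2023-01-02"}, "users"))
assert cursor.state == {"updated_at": "2023-01-02"}
```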
@@ -73,6 +83,7 @@ class ConcurrentCursor(Cursor):
         connector_state_converter: AbstractStreamStateConverter,
         cursor_field: CursorField,
         slice_boundary_fields: Optional[Tuple[str, str]],
+        start: Optional[Any],
     ) -> None:
         self._stream_name = stream_name
         self._stream_namespace = stream_namespace
@@ -82,9 +93,19 @@ class ConcurrentCursor(Cursor):
         self._cursor_field = cursor_field
         # To see some example where the slice boundaries might not be defined, check https://github.com/airbytehq/airbyte/blob/1ce84d6396e446e1ac2377362446e3fb94509461/airbyte-integrations/connectors/source-stripe/source_stripe/streams.py#L363-L379
         self._slice_boundary_fields = slice_boundary_fields if slice_boundary_fields else tuple()
+        self._start = start
         self._most_recent_record: Optional[Record] = None
         self._has_closed_at_least_one_slice = False
-        self.state = self._connector_state_converter.get_concurrent_stream_state(cursor_field, stream_state)
+        self.start, self._concurrent_state = self._get_concurrent_state(stream_state)
+
+    @property
+    def state(self) -> MutableMapping[str, Any]:
+        return self._concurrent_state
+
+    def _get_concurrent_state(self, state: MutableMapping[str, Any]) -> Tuple[datetime, MutableMapping[str, Any]]:
+        if self._connector_state_converter.is_state_message_compatible(state):
+            return self._start or self._connector_state_converter.zero_value, self._connector_state_converter.deserialize(state)
+        return self._connector_state_converter.convert_from_sequential_state(self._cursor_field, state, self._start)

     def observe(self, record: Record) -> None:
         if self._slice_boundary_fields:
@@ -102,7 +123,7 @@ class ConcurrentCursor(Cursor):
     def close_partition(self, partition: Partition) -> None:
         slice_count_before = len(self.state.get("slices", []))
         self._add_slice_to_state(partition)
-        if slice_count_before < len(self.state["slices"]):
+        if slice_count_before < len(self.state["slices"]):  # only emit if at least one slice has been processed
             self._merge_partitions()
             self._emit_state_message()
             self._has_closed_at_least_one_slice = True
@@ -110,7 +131,9 @@ class ConcurrentCursor(Cursor):
     def _add_slice_to_state(self, partition: Partition) -> None:
         if self._slice_boundary_fields:
             if "slices" not in self.state:
-                self.state["slices"] = []
+                raise RuntimeError(
+                    f"The state for stream {self._stream_name} should have at least one slice to delineate the sync start time, but no slices are present. This is unexpected. Please contact Support."
+                )
             self.state["slices"].append(
                 {
                     "start": self._extract_from_slice(partition, self._slice_boundary_fields[self._START_BOUNDARY]),
@@ -126,10 +149,8 @@ class ConcurrentCursor(Cursor):

             self.state["slices"].append(
                 {
-                    # TODO: if we migrate stored state to the concurrent state format, we may want this to be the config start date
-                    # instead of the zero value
-                    "start": self._connector_state_converter.zero_value,
-                    "end": self._extract_cursor_value(self._most_recent_record),
+                    self._connector_state_converter.START_KEY: self.start,
+                    self._connector_state_converter.END_KEY: self._extract_cursor_value(self._most_recent_record),
                 }
             )

airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py

@@ -2,11 +2,9 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #

-from queue import Queue
-
 from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel
 from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
-from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
+from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue


 class PartitionEnqueuer:
@@ -14,10 +12,10 @@ class PartitionEnqueuer:
     Generates partitions from a partition generator and puts them in a queue.
     """

-    def __init__(self, queue: Queue[QueueItem]) -> None:
+    def __init__(self, queue: ThrottledQueue) -> None:
         """
         :param queue: The queue to put the partitions in.
-        :param …
+        :param throttler: The throttler to use to throttle the partition generation.
         """
         self._queue = queue

airbyte_cdk/sources/streams/concurrent/partition_reader.py

@@ -2,10 +2,9 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #

-from queue import Queue
-
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
-from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem
+from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue
+from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel


 class PartitionReader:
@@ -13,7 +12,7 @@ class PartitionReader:
     Generates records from a partition and puts them in a queue.
     """

-    def __init__(self, queue: Queue[QueueItem]) -> None:
+    def __init__(self, queue: ThrottledQueue) -> None:
         """
         :param queue: The queue to put the records in.
         """
airbyte_cdk/sources/streams/concurrent/partitions/throttled_queue.py (new file)

@@ -0,0 +1,41 @@
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+
+from queue import Queue
+
+from airbyte_cdk.sources.concurrent_source.throttler import Throttler
+from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
+
+
+class ThrottledQueue:
+    """
+    A queue that throttles the number of items that can be added to it.
+
+    We throttle the queue using custom logic instead of relying on the queue's max size
+    because the main thread can continuously dequeue before submitting a future.
+
+    Since the main thread doesn't wait, it'll be able to remove items from the queue even if the tasks should be throttled,
+    so the tasks won't wait.
+
+    This class solves this issue by checking if we should throttle the queue before adding an item to it.
+    An example implementation of a throttler would check if the number of pending futures is greater than a certain threshold.
+    """
+
+    def __init__(self, queue: Queue[QueueItem], throttler: Throttler, timeout: float) -> None:
+        """
+        :param queue: The queue to throttle
+        :param throttler: The throttler to use to throttle the queue
+        :param timeout: The timeout to use when getting items from the queue
+        """
+        self._queue = queue
+        self._throttler = throttler
+        self._timeout = timeout
+
+    def put(self, item: QueueItem) -> None:
+        self._throttler.wait_and_acquire()
+        self._queue.put(item)
+
+    def get(self) -> QueueItem:
+        return self._queue.get(block=True, timeout=self._timeout)
+
+    def empty(self) -> bool:
+        return self._queue.empty()
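Back-pressure therefore happens on `put`, not `get`: producers (partition enqueuers and readers on worker threads) block in `wait_and_acquire` while the main thread keeps draining. A wiring sketch, assuming airbyte-cdk 0.59.0 is installed:

```python
from concurrent.futures import Future
from queue import Queue
from typing import Any, List

from airbyte_cdk.sources.concurrent_source.throttler import Throttler
from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue

futures: List[Future[Any]] = []  # shared with ThreadPoolManager in the real source
throttler = Throttler(futures, sleep_time=0.1, max_concurrent_tasks=10)
queue = ThrottledQueue(Queue(), throttler, timeout=300)

queue.put("hello")             # would first sleep if 10+ futures were pending
assert queue.get() == "hello"  # plain blocking get, bounded by the timeout
assert queue.empty()
```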
airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py

@@ -4,7 +4,7 @@

 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import TYPE_CHECKING, Any, List, MutableMapping, Optional
+from typing import TYPE_CHECKING, Any, List, MutableMapping, Tuple

 if TYPE_CHECKING:
     from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
@@ -18,15 +18,6 @@ class AbstractStreamStateConverter(ABC):
     START_KEY = "start"
     END_KEY = "end"

-    def get_concurrent_stream_state(
-        self, cursor_field: Optional["CursorField"], state: MutableMapping[str, Any]
-    ) -> Optional[MutableMapping[str, Any]]:
-        if not cursor_field:
-            return None
-        if self.is_state_message_compatible(state):
-            return self.deserialize(state)
-        return self.convert_from_sequential_state(cursor_field, state)
-
     @abstractmethod
     def deserialize(self, state: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
         """
@@ -40,8 +31,11 @@ class AbstractStreamStateConverter(ABC):

     @abstractmethod
     def convert_from_sequential_state(
-        self, cursor_field: "CursorField", stream_state: MutableMapping[str, Any]
-    ) -> MutableMapping[str, Any]:
+        self,
+        cursor_field: "CursorField",
+        stream_state: MutableMapping[str, Any],
+        start: Any,
+    ) -> Tuple[Any, MutableMapping[str, Any]]:
         """
         Convert the state message to the format required by the ConcurrentCursor.

airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py

@@ -4,7 +4,7 @@

 from abc import abstractmethod
 from datetime import datetime, timedelta
-from typing import Any, List, MutableMapping, Optional
+from typing import Any, List, MutableMapping, Optional, Tuple

 import pendulum
 from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
@@ -16,9 +16,6 @@ from pendulum.datetime import DateTime


 class DateTimeStreamStateConverter(AbstractStreamStateConverter):
-    START_KEY = "start"
-    END_KEY = "end"
-
     @property
     @abstractmethod
     def _zero_value(self) -> Any:
@@ -62,7 +59,7 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
         for interval in sorted_intervals[1:]:
             last_end_time = merged_intervals[-1][self.END_KEY]
             current_start_time = interval[self.START_KEY]
-            if self.compare_intervals(last_end_time, current_start_time):
+            if self._compare_intervals(last_end_time, current_start_time):
                 merged_end_time = max(last_end_time, interval[self.END_KEY])
                 merged_intervals[-1][self.END_KEY] = merged_end_time
             else:
@@ -70,10 +67,12 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):

         return merged_intervals

-    def compare_intervals(self, end_time: Any, start_time: Any) -> bool:
+    def _compare_intervals(self, end_time: Any, start_time: Any) -> bool:
         return bool(self.increment(end_time) >= start_time)

-    def convert_from_sequential_state(self, cursor_field: CursorField, stream_state: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
+    def convert_from_sequential_state(
+        self, cursor_field: CursorField, stream_state: MutableMapping[str, Any], start: datetime
+    ) -> Tuple[datetime, MutableMapping[str, Any]]:
         """
         Convert the state message to the format required by the ConcurrentCursor.

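`_compare_intervals` treats two slices as mergeable when incrementing the first slice's end reaches (or passes) the next slice's start, so `[0, 3]` and `[4, 7]` coalesce for a cursor with a granularity of 1. A standalone sketch of the same merge logic for integer (epoch-style) cursors, where `increment(t) == t + 1` is assumed:

```python
from typing import Dict, List

START_KEY, END_KEY = "start", "end"


def merge_intervals(intervals: List[Dict[str, int]]) -> List[Dict[str, int]]:
    # Mirrors DateTimeStreamStateConverter.merge_intervals for integer cursors.
    sorted_intervals = sorted(intervals, key=lambda i: (i[START_KEY], i[END_KEY]))
    merged = [dict(sorted_intervals[0])]
    for interval in sorted_intervals[1:]:
        if merged[-1][END_KEY] + 1 >= interval[START_KEY]:  # contiguous or overlapping
            merged[-1][END_KEY] = max(merged[-1][END_KEY], interval[END_KEY])
        else:
            merged.append(dict(interval))
    return merged


assert merge_intervals([{"start": 4, "end": 7}, {"start": 0, "end": 3}, {"start": 10, "end": 12}]) == [
    {"start": 0, "end": 7},    # [0, 3] and [4, 7] touch once 3 is incremented
    {"start": 10, "end": 12},  # gap of more than one unit, so it stays separate
]
```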
@@ -82,28 +81,35 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
             "state_type": ConcurrencyCompatibleStateType.date_range.value,
             "metadata": { … },
             "slices": [
-                { … }]
+                {"start": "2021-01-18T21:18:20.000+00:00", "end": "2021-01-18T21:18:20.000+00:00"},
+            ]
         }
         """
+        sync_start = self._get_sync_start(cursor_field, stream_state, start)
         if self.is_state_message_compatible(stream_state):
-            return stream_state
-        if cursor_field.cursor_field_key in stream_state:
-            slices = [
-                {
-                    # TODO: if we migrate stored state to the concurrent state format, we may want this to be the config start date
-                    # instead of the zero value
-                    "start": self.zero_value,
-                    "end": self.parse_timestamp(stream_state[cursor_field.cursor_field_key]),
-                },
-            ]
-        else:
-            slices = []
-        return {
+            return sync_start, stream_state
+
+        # Create a slice to represent the records synced during prior syncs.
+        # The start and end are the same to avoid confusion as to whether the records for this slice
+        # were actually synced
+        slices = [{self.START_KEY: sync_start, self.END_KEY: sync_start}]
+
+        return sync_start, {
             "state_type": ConcurrencyCompatibleStateType.date_range.value,
             "slices": slices,
             "legacy": stream_state,
         }

+    def _get_sync_start(self, cursor_field: CursorField, stream_state: MutableMapping[str, Any], start: Optional[Any]) -> datetime:
+        sync_start = self.parse_timestamp(start) if start is not None else self.zero_value
+        prev_sync_low_water_mark = (
+            self.parse_timestamp(stream_state[cursor_field.cursor_field_key]) if cursor_field.cursor_field_key in stream_state else None
+        )
+        if prev_sync_low_water_mark and prev_sync_low_water_mark >= sync_start:
+            return prev_sync_low_water_mark
+        else:
+            return sync_start
+
     def convert_to_sequential_state(self, cursor_field: CursorField, stream_state: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
         """
         Convert the state message from the concurrency-compatible format to the stream's original format.
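The effect of `_get_sync_start` is to resume from whichever is later: the configured start date or the previous sync's cursor value, defaulting to the zero value when neither exists. A pure-function sketch of that decision for integer timestamps (the real method parses values through the converter):

```python
from typing import Any, Mapping, Optional

ZERO_VALUE = 0  # stand-in for the converter's zero_value


def get_sync_start(cursor_field_key: str, stream_state: Mapping[str, Any], start: Optional[int]) -> int:
    # Mirrors DateTimeStreamStateConverter._get_sync_start for epoch cursors.
    sync_start = start if start is not None else ZERO_VALUE
    prev_watermark = stream_state.get(cursor_field_key)
    if prev_watermark is not None and prev_watermark >= sync_start:
        return prev_watermark
    return sync_start


assert get_sync_start("updated_at", {}, None) == 0                     # first sync, no start date
assert get_sync_start("updated_at", {}, 100) == 100                    # first sync with a start date
assert get_sync_start("updated_at", {"updated_at": 250}, 100) == 250   # prior watermark wins when later
```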
@@ -113,10 +119,9 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
         """
         if self.is_state_message_compatible(stream_state):
             legacy_state = stream_state.get("legacy", {})
-
-
-
-            legacy_state.update({cursor_field.cursor_field_key: self.output_format(latest_complete_time)})
+            latest_complete_time = self._get_latest_complete_time(stream_state.get("slices", []))
+            if latest_complete_time is not None:
+                legacy_state.update({cursor_field.cursor_field_key: self.output_format(latest_complete_time)})
             return legacy_state or {}
         else:
             return stream_state
@@ -125,11 +130,12 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
         """
         Get the latest time before which all records have been processed.
         """
-        if slices:
-            merged_intervals = self.merge_intervals(slices)
-            first_interval = merged_intervals[0]
-            return first_interval[self.END_KEY]
-        return None
+        if not slices:
+            raise RuntimeError("Expected at least one slice but there were none. This is unexpected; please contact Support.")
+
+        merged_intervals = self.merge_intervals(slices)
+        first_interval = merged_intervals[0]
+        return first_interval[self.END_KEY]


 class EpochValueConcurrentStreamStateConverter(DateTimeStreamStateConverter):
|