airbyte-cdk 0.58.8__py3-none-any.whl → 0.59.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +20 -21
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +4 -3
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +15 -18
- airbyte_cdk/sources/concurrent_source/throttler.py +25 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +29 -8
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +3 -5
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +3 -4
- airbyte_cdk/sources/streams/concurrent/partitions/throttled_queue.py +41 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +6 -12
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +36 -30
- {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/RECORD +23 -19
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +2 -2
- unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py +4 -10
- unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +82 -12
- unit_tests/sources/streams/concurrent/test_cursor.py +20 -3
- unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +166 -268
- unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +2 -15
- unit_tests/sources/streams/concurrent/test_throttled_queue.py +65 -0
- unit_tests/sources/streams/concurrent/test_throttler.py +13 -0
- {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/top_level.txt +0 -0
@@ -43,9 +43,9 @@ class ConcurrentReadProcessor:
|
|
43
43
|
"""
|
44
44
|
self._stream_name_to_instance = {s.name: s for s in stream_instances_to_read_from}
|
45
45
|
self._record_counter = {}
|
46
|
-
self.
|
46
|
+
self._streams_to_running_partitions: Dict[str, Set[Partition]] = {}
|
47
47
|
for stream in stream_instances_to_read_from:
|
48
|
-
self.
|
48
|
+
self._streams_to_running_partitions[stream.name] = set()
|
49
49
|
self._record_counter[stream.name] = 0
|
50
50
|
self._thread_pool_manager = thread_pool_manager
|
51
51
|
self._partition_enqueuer = partition_enqueuer
|
@@ -55,6 +55,7 @@ class ConcurrentReadProcessor:
|
|
55
55
|
self._slice_logger = slice_logger
|
56
56
|
self._message_repository = message_repository
|
57
57
|
self._partition_reader = partition_reader
|
58
|
+
self._streams_done: Set[str] = set()
|
58
59
|
|
59
60
|
def on_partition_generation_completed(self, sentinel: PartitionGenerationCompletedSentinel) -> Iterable[AirbyteMessage]:
|
60
61
|
"""
|
@@ -67,7 +68,8 @@ class ConcurrentReadProcessor:
|
|
67
68
|
self._streams_currently_generating_partitions.remove(sentinel.stream.name)
|
68
69
|
ret = []
|
69
70
|
# It is possible for the stream to already be done if no partitions were generated
|
70
|
-
|
71
|
+
# If the partition generation process was completed and there are no partitions left to process, the stream is done
|
72
|
+
if self._is_stream_done(stream_name) or len(self._streams_to_running_partitions[stream_name]) == 0:
|
71
73
|
ret.append(self._on_stream_is_done(stream_name))
|
72
74
|
if self._stream_instances_to_start_partition_generation:
|
73
75
|
ret.append(self.start_next_partition_generator())
|
@@ -81,7 +83,7 @@ class ConcurrentReadProcessor:
|
|
81
83
|
3. Submit the partition to the thread pool manager
|
82
84
|
"""
|
83
85
|
stream_name = partition.stream_name()
|
84
|
-
self.
|
86
|
+
self._streams_to_running_partitions[stream_name].add(partition)
|
85
87
|
if self._slice_logger.should_log_slice_message(self._logger):
|
86
88
|
self._message_repository.emit_message(self._slice_logger.create_slice_log_message(partition.to_slice()))
|
87
89
|
self._thread_pool_manager.submit(self._partition_reader.process_partition, partition)
|
@@ -95,8 +97,12 @@ class ConcurrentReadProcessor:
|
|
95
97
|
"""
|
96
98
|
partition = sentinel.partition
|
97
99
|
partition.close()
|
98
|
-
|
99
|
-
|
100
|
+
partitions_running = self._streams_to_running_partitions[partition.stream_name()]
|
101
|
+
if partition in partitions_running:
|
102
|
+
partitions_running.remove(partition)
|
103
|
+
# If all partitions were generated and this was the last one, the stream is done
|
104
|
+
if partition.stream_name() not in self._streams_currently_generating_partitions and len(partitions_running) == 0:
|
105
|
+
yield self._on_stream_is_done(partition.stream_name())
|
100
106
|
yield from self._message_repository.consume_queue()
|
101
107
|
|
102
108
|
def on_record(self, record: Record) -> Iterable[AirbyteMessage]:
|
@@ -114,11 +120,10 @@ class ConcurrentReadProcessor:
|
|
114
120
|
message = stream_data_to_airbyte_message(record.stream_name, record.data)
|
115
121
|
stream = self._stream_name_to_instance[record.stream_name]
|
116
122
|
|
117
|
-
if self._record_counter[stream.name] == 0:
|
118
|
-
self._logger.info(f"Marking stream {stream.name} as RUNNING")
|
119
|
-
yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING)
|
120
|
-
|
121
123
|
if message.type == MessageType.RECORD:
|
124
|
+
if self._record_counter[stream.name] == 0:
|
125
|
+
self._logger.info(f"Marking stream {stream.name} as RUNNING")
|
126
|
+
yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING)
|
122
127
|
self._record_counter[stream.name] += 1
|
123
128
|
yield message
|
124
129
|
yield from self._message_repository.consume_queue()
|
@@ -161,30 +166,24 @@ class ConcurrentReadProcessor:
|
|
161
166
|
2. There are no more streams to read from
|
162
167
|
3. All partitions for all streams are closed
|
163
168
|
"""
|
164
|
-
return (
|
165
|
-
not self._streams_currently_generating_partitions
|
166
|
-
and not self._stream_instances_to_start_partition_generation
|
167
|
-
and all([all(p.is_closed() for p in partitions) for partitions in self._streams_to_partitions.values()])
|
168
|
-
)
|
169
|
+
return all([self._is_stream_done(stream_name) for stream_name in self._stream_name_to_instance.keys()])
|
169
170
|
|
170
171
|
def _is_stream_done(self, stream_name: str) -> bool:
|
171
|
-
return
|
172
|
-
all([p.is_closed() for p in self._streams_to_partitions[stream_name]])
|
173
|
-
and stream_name not in self._streams_currently_generating_partitions
|
174
|
-
)
|
172
|
+
return stream_name in self._streams_done
|
175
173
|
|
176
174
|
def _on_stream_is_done(self, stream_name: str) -> AirbyteMessage:
|
177
175
|
self._logger.info(f"Read {self._record_counter[stream_name]} records from {stream_name} stream")
|
178
176
|
self._logger.info(f"Marking stream {stream_name} as STOPPED")
|
179
177
|
stream = self._stream_name_to_instance[stream_name]
|
180
178
|
self._logger.info(f"Finished syncing {stream.name}")
|
179
|
+
self._streams_done.add(stream_name)
|
181
180
|
return stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.COMPLETE)
|
182
181
|
|
183
182
|
def _stop_streams(self) -> Iterable[AirbyteMessage]:
|
184
183
|
self._thread_pool_manager.shutdown()
|
185
|
-
for stream_name
|
184
|
+
for stream_name in self._streams_to_running_partitions.keys():
|
186
185
|
stream = self._stream_name_to_instance[stream_name]
|
187
|
-
if not
|
186
|
+
if not self._is_stream_done(stream_name):
|
188
187
|
self._logger.info(f"Marking stream {stream.name} as STOPPED")
|
189
188
|
self._logger.info(f"Finished syncing {stream.name}")
|
190
189
|
yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.INCOMPLETE)
|
@@ -16,6 +16,7 @@ from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionE
|
|
16
16
|
from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
|
17
17
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
18
18
|
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
19
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue
|
19
20
|
from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem
|
20
21
|
from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
|
21
22
|
|
@@ -82,7 +83,7 @@ class ConcurrentSource:
|
|
82
83
|
if not stream_instances_to_read_from:
|
83
84
|
return
|
84
85
|
|
85
|
-
queue:
|
86
|
+
queue: ThrottledQueue = ThrottledQueue(Queue(), self._threadpool.get_throttler(), self._timeout_seconds)
|
86
87
|
concurrent_stream_processor = ConcurrentReadProcessor(
|
87
88
|
stream_instances_to_read_from,
|
88
89
|
PartitionEnqueuer(queue),
|
@@ -112,10 +113,10 @@ class ConcurrentSource:
|
|
112
113
|
|
113
114
|
def _consume_from_queue(
|
114
115
|
self,
|
115
|
-
queue:
|
116
|
+
queue: ThrottledQueue,
|
116
117
|
concurrent_stream_processor: ConcurrentReadProcessor,
|
117
118
|
) -> Iterable[AirbyteMessage]:
|
118
|
-
while airbyte_message_or_record_or_exception := queue.get(
|
119
|
+
while airbyte_message_or_record_or_exception := queue.get():
|
119
120
|
yield from self._handle_item(
|
120
121
|
airbyte_message_or_record_or_exception,
|
121
122
|
concurrent_stream_processor,
|
@@ -2,10 +2,11 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
import logging
|
5
|
-
import time
|
6
5
|
from concurrent.futures import Future, ThreadPoolExecutor
|
7
6
|
from typing import Any, Callable, List
|
8
7
|
|
8
|
+
from airbyte_cdk.sources.concurrent_source.throttler import Throttler
|
9
|
+
|
9
10
|
|
10
11
|
class ThreadPoolManager:
|
11
12
|
"""
|
@@ -19,8 +20,8 @@ class ThreadPoolManager:
|
|
19
20
|
self,
|
20
21
|
threadpool: ThreadPoolExecutor,
|
21
22
|
logger: logging.Logger,
|
22
|
-
max_concurrent_tasks: int = DEFAULT_MAX_QUEUE_SIZE,
|
23
23
|
sleep_time: float = DEFAULT_SLEEP_TIME,
|
24
|
+
max_concurrent_tasks: int = DEFAULT_MAX_QUEUE_SIZE,
|
24
25
|
):
|
25
26
|
"""
|
26
27
|
:param threadpool: The threadpool to use
|
@@ -31,23 +32,17 @@ class ThreadPoolManager:
|
|
31
32
|
self._threadpool = threadpool
|
32
33
|
self._logger = logger
|
33
34
|
self._max_concurrent_tasks = max_concurrent_tasks
|
34
|
-
self._sleep_time = sleep_time
|
35
35
|
self._futures: List[Future[Any]] = []
|
36
|
+
self._throttler = Throttler(self._futures, sleep_time, max_concurrent_tasks)
|
37
|
+
|
38
|
+
def get_throttler(self) -> Throttler:
|
39
|
+
return self._throttler
|
36
40
|
|
37
41
|
def submit(self, function: Callable[..., Any], *args: Any) -> None:
|
38
|
-
# Submit a task to the threadpool,
|
39
|
-
self.
|
42
|
+
# Submit a task to the threadpool, removing completed tasks if there are too many tasks in self._futures.
|
43
|
+
self._prune_futures(self._futures)
|
40
44
|
self._futures.append(self._threadpool.submit(function, *args))
|
41
45
|
|
42
|
-
def _wait_while_too_many_pending_futures(self, futures: List[Future[Any]]) -> None:
|
43
|
-
# Wait until the number of pending tasks is < self._max_concurrent_tasks
|
44
|
-
while True:
|
45
|
-
self._prune_futures(futures)
|
46
|
-
if len(futures) < self._max_concurrent_tasks:
|
47
|
-
break
|
48
|
-
self._logger.info("Main thread is sleeping because the task queue is full...")
|
49
|
-
time.sleep(self._sleep_time)
|
50
|
-
|
51
46
|
def _prune_futures(self, futures: List[Future[Any]]) -> None:
|
52
47
|
"""
|
53
48
|
Take a list in input and remove the futures that are completed. If a future has an exception, it'll raise and kill the stream
|
@@ -60,12 +55,14 @@ class ThreadPoolManager:
|
|
60
55
|
|
61
56
|
for index in reversed(range(len(futures))):
|
62
57
|
future = futures[index]
|
63
|
-
optional_exception = future.exception()
|
64
|
-
if optional_exception:
|
65
|
-
exception = RuntimeError(f"Failed reading with error: {optional_exception}")
|
66
|
-
self._stop_and_raise_exception(exception)
|
67
58
|
|
68
59
|
if future.done():
|
60
|
+
# Only call future.exception() if the future is known to be done because it will block until the future is done.
|
61
|
+
# See https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.Future.exception
|
62
|
+
optional_exception = future.exception()
|
63
|
+
if optional_exception:
|
64
|
+
exception = RuntimeError(f"Failed reading with error: {optional_exception}")
|
65
|
+
self._stop_and_raise_exception(exception)
|
69
66
|
futures.pop(index)
|
70
67
|
|
71
68
|
def shutdown(self) -> None:
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
2
|
+
|
3
|
+
import time
|
4
|
+
from concurrent.futures import Future
|
5
|
+
from typing import Any, List
|
6
|
+
|
7
|
+
|
8
|
+
class Throttler:
|
9
|
+
"""
|
10
|
+
A throttler that waits until the number of concurrent tasks is below a certain threshold.
|
11
|
+
"""
|
12
|
+
|
13
|
+
def __init__(self, futures_list: List[Future[Any]], sleep_time: float, max_concurrent_tasks: int):
|
14
|
+
"""
|
15
|
+
:param futures_list: The list of futures to monitor
|
16
|
+
:param sleep_time: How long to sleep if there are too many pending tasks
|
17
|
+
:param max_concurrent_tasks: The maximum number of tasks that can be pending at the same time
|
18
|
+
"""
|
19
|
+
self._futures_list = futures_list
|
20
|
+
self._sleep_time = sleep_time
|
21
|
+
self._max_concurrent_tasks = max_concurrent_tasks
|
22
|
+
|
23
|
+
def wait_and_acquire(self) -> None:
|
24
|
+
while len(self._futures_list) >= self._max_concurrent_tasks:
|
25
|
+
time.sleep(self._sleep_time)
|
@@ -3,7 +3,8 @@
|
|
3
3
|
#
|
4
4
|
import functools
|
5
5
|
from abc import ABC, abstractmethod
|
6
|
-
from
|
6
|
+
from datetime import datetime
|
7
|
+
from typing import Any, List, Mapping, MutableMapping, Optional, Protocol, Tuple
|
7
8
|
|
8
9
|
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
9
10
|
from airbyte_cdk.sources.message import MessageRepository
|
@@ -36,6 +37,11 @@ class CursorField:
|
|
36
37
|
|
37
38
|
|
38
39
|
class Cursor(ABC):
|
40
|
+
@property
|
41
|
+
@abstractmethod
|
42
|
+
def state(self) -> MutableMapping[str, Any]:
|
43
|
+
...
|
44
|
+
|
39
45
|
@abstractmethod
|
40
46
|
def observe(self, record: Record) -> None:
|
41
47
|
"""
|
@@ -52,6 +58,10 @@ class Cursor(ABC):
|
|
52
58
|
|
53
59
|
|
54
60
|
class NoopCursor(Cursor):
|
61
|
+
@property
|
62
|
+
def state(self) -> MutableMapping[str, Any]:
|
63
|
+
return {}
|
64
|
+
|
55
65
|
def observe(self, record: Record) -> None:
|
56
66
|
pass
|
57
67
|
|
@@ -73,6 +83,7 @@ class ConcurrentCursor(Cursor):
|
|
73
83
|
connector_state_converter: AbstractStreamStateConverter,
|
74
84
|
cursor_field: CursorField,
|
75
85
|
slice_boundary_fields: Optional[Tuple[str, str]],
|
86
|
+
start: Optional[Any],
|
76
87
|
) -> None:
|
77
88
|
self._stream_name = stream_name
|
78
89
|
self._stream_namespace = stream_namespace
|
@@ -82,9 +93,19 @@ class ConcurrentCursor(Cursor):
|
|
82
93
|
self._cursor_field = cursor_field
|
83
94
|
# To see some example where the slice boundaries might not be defined, check https://github.com/airbytehq/airbyte/blob/1ce84d6396e446e1ac2377362446e3fb94509461/airbyte-integrations/connectors/source-stripe/source_stripe/streams.py#L363-L379
|
84
95
|
self._slice_boundary_fields = slice_boundary_fields if slice_boundary_fields else tuple()
|
96
|
+
self._start = start
|
85
97
|
self._most_recent_record: Optional[Record] = None
|
86
98
|
self._has_closed_at_least_one_slice = False
|
87
|
-
self.
|
99
|
+
self.start, self._concurrent_state = self._get_concurrent_state(stream_state)
|
100
|
+
|
101
|
+
@property
|
102
|
+
def state(self) -> MutableMapping[str, Any]:
|
103
|
+
return self._concurrent_state
|
104
|
+
|
105
|
+
def _get_concurrent_state(self, state: MutableMapping[str, Any]) -> Tuple[datetime, MutableMapping[str, Any]]:
|
106
|
+
if self._connector_state_converter.is_state_message_compatible(state):
|
107
|
+
return self._start or self._connector_state_converter.zero_value, self._connector_state_converter.deserialize(state)
|
108
|
+
return self._connector_state_converter.convert_from_sequential_state(self._cursor_field, state, self._start)
|
88
109
|
|
89
110
|
def observe(self, record: Record) -> None:
|
90
111
|
if self._slice_boundary_fields:
|
@@ -102,7 +123,7 @@ class ConcurrentCursor(Cursor):
|
|
102
123
|
def close_partition(self, partition: Partition) -> None:
|
103
124
|
slice_count_before = len(self.state.get("slices", []))
|
104
125
|
self._add_slice_to_state(partition)
|
105
|
-
if slice_count_before < len(self.state["slices"]):
|
126
|
+
if slice_count_before < len(self.state["slices"]): # only emit if at least one slice has been processed
|
106
127
|
self._merge_partitions()
|
107
128
|
self._emit_state_message()
|
108
129
|
self._has_closed_at_least_one_slice = True
|
@@ -110,7 +131,9 @@ class ConcurrentCursor(Cursor):
|
|
110
131
|
def _add_slice_to_state(self, partition: Partition) -> None:
|
111
132
|
if self._slice_boundary_fields:
|
112
133
|
if "slices" not in self.state:
|
113
|
-
|
134
|
+
raise RuntimeError(
|
135
|
+
f"The state for stream {self._stream_name} should have at least one slice to delineate the sync start time, but no slices are present. This is unexpected. Please contact Support."
|
136
|
+
)
|
114
137
|
self.state["slices"].append(
|
115
138
|
{
|
116
139
|
"start": self._extract_from_slice(partition, self._slice_boundary_fields[self._START_BOUNDARY]),
|
@@ -126,10 +149,8 @@ class ConcurrentCursor(Cursor):
|
|
126
149
|
|
127
150
|
self.state["slices"].append(
|
128
151
|
{
|
129
|
-
|
130
|
-
|
131
|
-
"start": self._connector_state_converter.zero_value,
|
132
|
-
"end": self._extract_cursor_value(self._most_recent_record),
|
152
|
+
self._connector_state_converter.START_KEY: self.start,
|
153
|
+
self._connector_state_converter.END_KEY: self._extract_cursor_value(self._most_recent_record),
|
133
154
|
}
|
134
155
|
)
|
135
156
|
|
@@ -2,11 +2,9 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
-
from queue import Queue
|
6
|
-
|
7
5
|
from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel
|
8
6
|
from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
|
9
|
-
from airbyte_cdk.sources.streams.concurrent.partitions.
|
7
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue
|
10
8
|
|
11
9
|
|
12
10
|
class PartitionEnqueuer:
|
@@ -14,10 +12,10 @@ class PartitionEnqueuer:
|
|
14
12
|
Generates partitions from a partition generator and puts them in a queue.
|
15
13
|
"""
|
16
14
|
|
17
|
-
def __init__(self, queue:
|
15
|
+
def __init__(self, queue: ThrottledQueue) -> None:
|
18
16
|
"""
|
19
17
|
:param queue: The queue to put the partitions in.
|
20
|
-
:param
|
18
|
+
:param throttler: The throttler to use to throttle the partition generation.
|
21
19
|
"""
|
22
20
|
self._queue = queue
|
23
21
|
|
@@ -2,10 +2,9 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
-
from queue import Queue
|
6
|
-
|
7
5
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
8
|
-
from airbyte_cdk.sources.streams.concurrent.partitions.
|
6
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue
|
7
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel
|
9
8
|
|
10
9
|
|
11
10
|
class PartitionReader:
|
@@ -13,7 +12,7 @@ class PartitionReader:
|
|
13
12
|
Generates records from a partition and puts them in a queue.
|
14
13
|
"""
|
15
14
|
|
16
|
-
def __init__(self, queue:
|
15
|
+
def __init__(self, queue: ThrottledQueue) -> None:
|
17
16
|
"""
|
18
17
|
:param queue: The queue to put the records in.
|
19
18
|
"""
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
2
|
+
|
3
|
+
from queue import Queue
|
4
|
+
|
5
|
+
from airbyte_cdk.sources.concurrent_source.throttler import Throttler
|
6
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
|
7
|
+
|
8
|
+
|
9
|
+
class ThrottledQueue:
|
10
|
+
"""
|
11
|
+
A queue that throttles the number of items that can be added to it.
|
12
|
+
|
13
|
+
We throttle the queue using custom logic instead of relying on the queue's max size
|
14
|
+
because the main thread can continuously dequeue before submitting a future.
|
15
|
+
|
16
|
+
Since the main thread doesn't wait, it'll be able to remove items from the queue even if the tasks should be throttled,
|
17
|
+
so the tasks won't wait.
|
18
|
+
|
19
|
+
This class solves this issue by checking if we should throttle the queue before adding an item to it.
|
20
|
+
An example implementation of a throttler would check if the number of pending futures is greater than a certain threshold.
|
21
|
+
"""
|
22
|
+
|
23
|
+
def __init__(self, queue: Queue[QueueItem], throttler: Throttler, timeout: float) -> None:
|
24
|
+
"""
|
25
|
+
:param queue: The queue to throttle
|
26
|
+
:param throttler: The throttler to use to throttle the queue
|
27
|
+
:param timeout: The timeout to use when getting items from the queue
|
28
|
+
"""
|
29
|
+
self._queue = queue
|
30
|
+
self._throttler = throttler
|
31
|
+
self._timeout = timeout
|
32
|
+
|
33
|
+
def put(self, item: QueueItem) -> None:
|
34
|
+
self._throttler.wait_and_acquire()
|
35
|
+
self._queue.put(item)
|
36
|
+
|
37
|
+
def get(self) -> QueueItem:
|
38
|
+
return self._queue.get(block=True, timeout=self._timeout)
|
39
|
+
|
40
|
+
def empty(self) -> bool:
|
41
|
+
return self._queue.empty()
|
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
from abc import ABC, abstractmethod
|
6
6
|
from enum import Enum
|
7
|
-
from typing import TYPE_CHECKING, Any, List, MutableMapping,
|
7
|
+
from typing import TYPE_CHECKING, Any, List, MutableMapping, Tuple
|
8
8
|
|
9
9
|
if TYPE_CHECKING:
|
10
10
|
from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
|
@@ -18,15 +18,6 @@ class AbstractStreamStateConverter(ABC):
|
|
18
18
|
START_KEY = "start"
|
19
19
|
END_KEY = "end"
|
20
20
|
|
21
|
-
def get_concurrent_stream_state(
|
22
|
-
self, cursor_field: Optional["CursorField"], state: MutableMapping[str, Any]
|
23
|
-
) -> Optional[MutableMapping[str, Any]]:
|
24
|
-
if not cursor_field:
|
25
|
-
return None
|
26
|
-
if self.is_state_message_compatible(state):
|
27
|
-
return self.deserialize(state)
|
28
|
-
return self.convert_from_sequential_state(cursor_field, state)
|
29
|
-
|
30
21
|
@abstractmethod
|
31
22
|
def deserialize(self, state: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
|
32
23
|
"""
|
@@ -40,8 +31,11 @@ class AbstractStreamStateConverter(ABC):
|
|
40
31
|
|
41
32
|
@abstractmethod
|
42
33
|
def convert_from_sequential_state(
|
43
|
-
self,
|
44
|
-
|
34
|
+
self,
|
35
|
+
cursor_field: "CursorField",
|
36
|
+
stream_state: MutableMapping[str, Any],
|
37
|
+
start: Any,
|
38
|
+
) -> Tuple[Any, MutableMapping[str, Any]]:
|
45
39
|
"""
|
46
40
|
Convert the state message to the format required by the ConcurrentCursor.
|
47
41
|
|
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
from abc import abstractmethod
|
6
6
|
from datetime import datetime, timedelta
|
7
|
-
from typing import Any, List, MutableMapping, Optional
|
7
|
+
from typing import Any, List, MutableMapping, Optional, Tuple
|
8
8
|
|
9
9
|
import pendulum
|
10
10
|
from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
|
@@ -16,9 +16,6 @@ from pendulum.datetime import DateTime
|
|
16
16
|
|
17
17
|
|
18
18
|
class DateTimeStreamStateConverter(AbstractStreamStateConverter):
|
19
|
-
START_KEY = "start"
|
20
|
-
END_KEY = "end"
|
21
|
-
|
22
19
|
@property
|
23
20
|
@abstractmethod
|
24
21
|
def _zero_value(self) -> Any:
|
@@ -62,7 +59,7 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
|
|
62
59
|
for interval in sorted_intervals[1:]:
|
63
60
|
last_end_time = merged_intervals[-1][self.END_KEY]
|
64
61
|
current_start_time = interval[self.START_KEY]
|
65
|
-
if self.
|
62
|
+
if self._compare_intervals(last_end_time, current_start_time):
|
66
63
|
merged_end_time = max(last_end_time, interval[self.END_KEY])
|
67
64
|
merged_intervals[-1][self.END_KEY] = merged_end_time
|
68
65
|
else:
|
@@ -70,10 +67,12 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
|
|
70
67
|
|
71
68
|
return merged_intervals
|
72
69
|
|
73
|
-
def
|
70
|
+
def _compare_intervals(self, end_time: Any, start_time: Any) -> bool:
|
74
71
|
return bool(self.increment(end_time) >= start_time)
|
75
72
|
|
76
|
-
def convert_from_sequential_state(
|
73
|
+
def convert_from_sequential_state(
|
74
|
+
self, cursor_field: CursorField, stream_state: MutableMapping[str, Any], start: datetime
|
75
|
+
) -> Tuple[datetime, MutableMapping[str, Any]]:
|
77
76
|
"""
|
78
77
|
Convert the state message to the format required by the ConcurrentCursor.
|
79
78
|
|
@@ -82,28 +81,35 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
|
|
82
81
|
"state_type": ConcurrencyCompatibleStateType.date_range.value,
|
83
82
|
"metadata": { … },
|
84
83
|
"slices": [
|
85
|
-
{
|
84
|
+
{"start": "2021-01-18T21:18:20.000+00:00", "end": "2021-01-18T21:18:20.000+00:00"},
|
85
|
+
]
|
86
86
|
}
|
87
87
|
"""
|
88
|
+
sync_start = self._get_sync_start(cursor_field, stream_state, start)
|
88
89
|
if self.is_state_message_compatible(stream_state):
|
89
|
-
return stream_state
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
},
|
98
|
-
]
|
99
|
-
else:
|
100
|
-
slices = []
|
101
|
-
return {
|
90
|
+
return sync_start, stream_state
|
91
|
+
|
92
|
+
# Create a slice to represent the records synced during prior syncs.
|
93
|
+
# The start and end are the same to avoid confusion as to whether the records for this slice
|
94
|
+
# were actually synced
|
95
|
+
slices = [{self.START_KEY: sync_start, self.END_KEY: sync_start}]
|
96
|
+
|
97
|
+
return sync_start, {
|
102
98
|
"state_type": ConcurrencyCompatibleStateType.date_range.value,
|
103
99
|
"slices": slices,
|
104
100
|
"legacy": stream_state,
|
105
101
|
}
|
106
102
|
|
103
|
+
def _get_sync_start(self, cursor_field: CursorField, stream_state: MutableMapping[str, Any], start: Optional[Any]) -> datetime:
|
104
|
+
sync_start = self.parse_timestamp(start) if start is not None else self.zero_value
|
105
|
+
prev_sync_low_water_mark = (
|
106
|
+
self.parse_timestamp(stream_state[cursor_field.cursor_field_key]) if cursor_field.cursor_field_key in stream_state else None
|
107
|
+
)
|
108
|
+
if prev_sync_low_water_mark and prev_sync_low_water_mark >= sync_start:
|
109
|
+
return prev_sync_low_water_mark
|
110
|
+
else:
|
111
|
+
return sync_start
|
112
|
+
|
107
113
|
def convert_to_sequential_state(self, cursor_field: CursorField, stream_state: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
|
108
114
|
"""
|
109
115
|
Convert the state message from the concurrency-compatible format to the stream's original format.
|
@@ -113,10 +119,9 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
|
|
113
119
|
"""
|
114
120
|
if self.is_state_message_compatible(stream_state):
|
115
121
|
legacy_state = stream_state.get("legacy", {})
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
legacy_state.update({cursor_field.cursor_field_key: self.output_format(latest_complete_time)})
|
122
|
+
latest_complete_time = self._get_latest_complete_time(stream_state.get("slices", []))
|
123
|
+
if latest_complete_time is not None:
|
124
|
+
legacy_state.update({cursor_field.cursor_field_key: self.output_format(latest_complete_time)})
|
120
125
|
return legacy_state or {}
|
121
126
|
else:
|
122
127
|
return stream_state
|
@@ -125,11 +130,12 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
|
|
125
130
|
"""
|
126
131
|
Get the latest time before which all records have been processed.
|
127
132
|
"""
|
128
|
-
if slices:
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
+
if not slices:
|
134
|
+
raise RuntimeError("Expected at least one slice but there were none. This is unexpected; please contact Support.")
|
135
|
+
|
136
|
+
merged_intervals = self.merge_intervals(slices)
|
137
|
+
first_interval = merged_intervals[0]
|
138
|
+
return first_interval[self.END_KEY]
|
133
139
|
|
134
140
|
|
135
141
|
class EpochValueConcurrentStreamStateConverter(DateTimeStreamStateConverter):
|