airbyte-cdk 0.58.8__py3-none-any.whl → 0.59.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +20 -21
  2. airbyte_cdk/sources/concurrent_source/concurrent_source.py +4 -3
  3. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +15 -18
  4. airbyte_cdk/sources/concurrent_source/throttler.py +25 -0
  5. airbyte_cdk/sources/streams/concurrent/cursor.py +29 -8
  6. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +3 -5
  7. airbyte_cdk/sources/streams/concurrent/partition_reader.py +3 -4
  8. airbyte_cdk/sources/streams/concurrent/partitions/throttled_queue.py +41 -0
  9. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +6 -12
  10. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +36 -30
  11. {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/METADATA +1 -1
  12. {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/RECORD +23 -19
  13. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +2 -2
  14. unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py +4 -10
  15. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +82 -12
  16. unit_tests/sources/streams/concurrent/test_cursor.py +20 -3
  17. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +166 -268
  18. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +2 -15
  19. unit_tests/sources/streams/concurrent/test_throttled_queue.py +65 -0
  20. unit_tests/sources/streams/concurrent/test_throttler.py +13 -0
  21. {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/LICENSE.txt +0 -0
  22. {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/WHEEL +0 -0
  23. {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/top_level.txt +0 -0
airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py
@@ -43,9 +43,9 @@ class ConcurrentReadProcessor:
         """
         self._stream_name_to_instance = {s.name: s for s in stream_instances_to_read_from}
         self._record_counter = {}
-        self._streams_to_partitions: Dict[str, Set[Partition]] = {}
+        self._streams_to_running_partitions: Dict[str, Set[Partition]] = {}
         for stream in stream_instances_to_read_from:
-            self._streams_to_partitions[stream.name] = set()
+            self._streams_to_running_partitions[stream.name] = set()
             self._record_counter[stream.name] = 0
         self._thread_pool_manager = thread_pool_manager
         self._partition_enqueuer = partition_enqueuer
@@ -55,6 +55,7 @@ class ConcurrentReadProcessor:
         self._slice_logger = slice_logger
         self._message_repository = message_repository
         self._partition_reader = partition_reader
+        self._streams_done: Set[str] = set()
 
     def on_partition_generation_completed(self, sentinel: PartitionGenerationCompletedSentinel) -> Iterable[AirbyteMessage]:
         """
@@ -67,7 +68,8 @@ class ConcurrentReadProcessor:
         self._streams_currently_generating_partitions.remove(sentinel.stream.name)
         ret = []
         # It is possible for the stream to already be done if no partitions were generated
-        if self._is_stream_done(stream_name):
+        # If the partition generation process was completed and there are no partitions left to process, the stream is done
+        if self._is_stream_done(stream_name) or len(self._streams_to_running_partitions[stream_name]) == 0:
             ret.append(self._on_stream_is_done(stream_name))
         if self._stream_instances_to_start_partition_generation:
             ret.append(self.start_next_partition_generator())
@@ -81,7 +83,7 @@ class ConcurrentReadProcessor:
         3. Submit the partition to the thread pool manager
         """
         stream_name = partition.stream_name()
-        self._streams_to_partitions[stream_name].add(partition)
+        self._streams_to_running_partitions[stream_name].add(partition)
         if self._slice_logger.should_log_slice_message(self._logger):
             self._message_repository.emit_message(self._slice_logger.create_slice_log_message(partition.to_slice()))
         self._thread_pool_manager.submit(self._partition_reader.process_partition, partition)
@@ -95,8 +97,12 @@ class ConcurrentReadProcessor:
         """
         partition = sentinel.partition
         partition.close()
-        if self._is_stream_done(partition.stream_name()):
-            yield self._on_stream_is_done(partition.stream_name())
+        partitions_running = self._streams_to_running_partitions[partition.stream_name()]
+        if partition in partitions_running:
+            partitions_running.remove(partition)
+            # If all partitions were generated and this was the last one, the stream is done
+            if partition.stream_name() not in self._streams_currently_generating_partitions and len(partitions_running) == 0:
+                yield self._on_stream_is_done(partition.stream_name())
         yield from self._message_repository.consume_queue()
 
     def on_record(self, record: Record) -> Iterable[AirbyteMessage]:
@@ -114,11 +120,10 @@ class ConcurrentReadProcessor:
         message = stream_data_to_airbyte_message(record.stream_name, record.data)
         stream = self._stream_name_to_instance[record.stream_name]
 
-        if self._record_counter[stream.name] == 0:
-            self._logger.info(f"Marking stream {stream.name} as RUNNING")
-            yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING)
-
         if message.type == MessageType.RECORD:
+            if self._record_counter[stream.name] == 0:
+                self._logger.info(f"Marking stream {stream.name} as RUNNING")
+                yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING)
             self._record_counter[stream.name] += 1
         yield message
         yield from self._message_repository.consume_queue()
@@ -161,30 +166,24 @@ class ConcurrentReadProcessor:
         2. There are no more streams to read from
         3. All partitions for all streams are closed
         """
-        return (
-            not self._streams_currently_generating_partitions
-            and not self._stream_instances_to_start_partition_generation
-            and all([all(p.is_closed() for p in partitions) for partitions in self._streams_to_partitions.values()])
-        )
+        return all([self._is_stream_done(stream_name) for stream_name in self._stream_name_to_instance.keys()])
 
     def _is_stream_done(self, stream_name: str) -> bool:
-        return (
-            all([p.is_closed() for p in self._streams_to_partitions[stream_name]])
-            and stream_name not in self._streams_currently_generating_partitions
-        )
+        return stream_name in self._streams_done
 
     def _on_stream_is_done(self, stream_name: str) -> AirbyteMessage:
         self._logger.info(f"Read {self._record_counter[stream_name]} records from {stream_name} stream")
         self._logger.info(f"Marking stream {stream_name} as STOPPED")
         stream = self._stream_name_to_instance[stream_name]
         self._logger.info(f"Finished syncing {stream.name}")
+        self._streams_done.add(stream_name)
         return stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.COMPLETE)
 
     def _stop_streams(self) -> Iterable[AirbyteMessage]:
         self._thread_pool_manager.shutdown()
-        for stream_name, partitions in self._streams_to_partitions.items():
+        for stream_name in self._streams_to_running_partitions.keys():
             stream = self._stream_name_to_instance[stream_name]
-            if not all([p.is_closed() for p in partitions]):
+            if not self._is_stream_done(stream_name):
                 self._logger.info(f"Marking stream {stream.name} as STOPPED")
                 self._logger.info(f"Finished syncing {stream.name}")
                 yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.INCOMPLETE)
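
Taken together, the hunks above swap 0.58.8's repeated "are all partitions closed?" scans for explicit bookkeeping: a stream is marked done exactly once, either when partition generation finishes with no partitions still running, or when the last running partition closes after generation has finished. A standalone sketch of that invariant (the names below are invented for illustration, not part of the package):

from typing import Dict, Set

running_partitions: Dict[str, Set[str]] = {"users": set()}
generating: Set[str] = {"users"}
streams_done: Set[str] = set()

def on_partition(stream: str, partition: str) -> None:
    running_partitions[stream].add(partition)  # mirrors _streams_to_running_partitions

def on_partition_complete(stream: str, partition: str) -> None:
    running_partitions[stream].discard(partition)
    if stream not in generating and not running_partitions[stream]:
        streams_done.add(stream)  # mirrors _on_stream_is_done recording into _streams_done

def on_generation_complete(stream: str) -> None:
    generating.discard(stream)
    if not running_partitions[stream]:
        streams_done.add(stream)  # covers streams that generated no partitions at all

on_partition("users", "p1")
on_generation_complete("users")       # "users" is not done yet: p1 is still running
on_partition_complete("users", "p1")  # last running partition closes
assert streams_done == {"users"}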
airbyte_cdk/sources/concurrent_source/concurrent_source.py
@@ -16,6 +16,7 @@ from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer
 from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
+from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue
 from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem
 from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
 
@@ -82,7 +83,7 @@ class ConcurrentSource:
         if not stream_instances_to_read_from:
             return
 
-        queue: Queue[QueueItem] = Queue()
+        queue: ThrottledQueue = ThrottledQueue(Queue(), self._threadpool.get_throttler(), self._timeout_seconds)
         concurrent_stream_processor = ConcurrentReadProcessor(
             stream_instances_to_read_from,
             PartitionEnqueuer(queue),
@@ -112,10 +113,10 @@
 
     def _consume_from_queue(
         self,
-        queue: Queue[QueueItem],
+        queue: ThrottledQueue,
         concurrent_stream_processor: ConcurrentReadProcessor,
     ) -> Iterable[AirbyteMessage]:
-        while airbyte_message_or_record_or_exception := queue.get(block=True, timeout=self._timeout_seconds):
+        while airbyte_message_or_record_or_exception := queue.get():
             yield from self._handle_item(
                 airbyte_message_or_record_or_exception,
                 concurrent_stream_processor,
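
The read loop itself barely changes shape; the timeout simply moves into the queue object (see throttled_queue.py below). A simplified, self-contained sketch of the resulting loop, using a stand-in queue:

from typing import Iterator, List, Optional

class FakeQueue:
    def __init__(self, items: List[Optional[str]]) -> None:  # stand-in for ThrottledQueue
        self._items = items
    def get(self) -> Optional[str]:
        return self._items.pop(0)  # the real get() also enforces the configured timeout

def consume(queue: FakeQueue) -> Iterator[str]:
    while item := queue.get():  # ends on a falsy item, e.g. a sentinel
        yield f"handled {item!r}"

assert list(consume(FakeQueue(["a", "b", None]))) == ["handled 'a'", "handled 'b'"]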
airbyte_cdk/sources/concurrent_source/thread_pool_manager.py
@@ -2,10 +2,11 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 import logging
-import time
 from concurrent.futures import Future, ThreadPoolExecutor
 from typing import Any, Callable, List
 
+from airbyte_cdk.sources.concurrent_source.throttler import Throttler
+
 
 class ThreadPoolManager:
     """
@@ -19,8 +20,8 @@ class ThreadPoolManager:
         self,
         threadpool: ThreadPoolExecutor,
         logger: logging.Logger,
-        max_concurrent_tasks: int = DEFAULT_MAX_QUEUE_SIZE,
         sleep_time: float = DEFAULT_SLEEP_TIME,
+        max_concurrent_tasks: int = DEFAULT_MAX_QUEUE_SIZE,
     ):
         """
         :param threadpool: The threadpool to use
@@ -31,23 +32,17 @@ class ThreadPoolManager:
         self._threadpool = threadpool
         self._logger = logger
         self._max_concurrent_tasks = max_concurrent_tasks
-        self._sleep_time = sleep_time
         self._futures: List[Future[Any]] = []
+        self._throttler = Throttler(self._futures, sleep_time, max_concurrent_tasks)
+
+    def get_throttler(self) -> Throttler:
+        return self._throttler
 
     def submit(self, function: Callable[..., Any], *args: Any) -> None:
-        # Submit a task to the threadpool, waiting if there are too many pending tasks
-        self._wait_while_too_many_pending_futures(self._futures)
+        # Submit a task to the threadpool, removing completed tasks if there are too many tasks in self._futures.
+        self._prune_futures(self._futures)
        self._futures.append(self._threadpool.submit(function, *args))
 
-    def _wait_while_too_many_pending_futures(self, futures: List[Future[Any]]) -> None:
-        # Wait until the number of pending tasks is < self._max_concurrent_tasks
-        while True:
-            self._prune_futures(futures)
-            if len(futures) < self._max_concurrent_tasks:
-                break
-            self._logger.info("Main thread is sleeping because the task queue is full...")
-            time.sleep(self._sleep_time)
-
     def _prune_futures(self, futures: List[Future[Any]]) -> None:
         """
         Take a list in input and remove the futures that are completed. If a future has an exception, it'll raise and kill the stream
@@ -60,12 +55,14 @@
 
         for index in reversed(range(len(futures))):
             future = futures[index]
-            optional_exception = future.exception()
-            if optional_exception:
-                exception = RuntimeError(f"Failed reading with error: {optional_exception}")
-                self._stop_and_raise_exception(exception)
 
             if future.done():
+                # Only call future.exception() if the future is known to be done because it will block until the future is done.
+                # See https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.Future.exception
+                optional_exception = future.exception()
+                if optional_exception:
+                    exception = RuntimeError(f"Failed reading with error: {optional_exception}")
+                    self._stop_and_raise_exception(exception)
                 futures.pop(index)
 
     def shutdown(self) -> None:
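
The reordering here matters because Future.exception() blocks until the future resolves, so calling it on a still-pending future would stall the main thread. A standalone illustration using only the standard library:

import time
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=2) as pool:
    slow = pool.submit(time.sleep, 2)
    quick = pool.submit(lambda: None)
    quick.result()                    # make sure the quick task has resolved
    assert not slow.done()            # slow is still running...
    # ...so slow.exception() here would block for roughly two seconds.
    assert quick.done()
    assert quick.exception() is None  # returns immediately once done() is True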
airbyte_cdk/sources/concurrent_source/throttler.py (new file)
@@ -0,0 +1,25 @@
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+
+import time
+from concurrent.futures import Future
+from typing import Any, List
+
+
+class Throttler:
+    """
+    A throttler that waits until the number of concurrent tasks is below a certain threshold.
+    """
+
+    def __init__(self, futures_list: List[Future[Any]], sleep_time: float, max_concurrent_tasks: int):
+        """
+        :param futures_list: The list of futures to monitor
+        :param sleep_time: How long to sleep if there are too many pending tasks
+        :param max_concurrent_tasks: The maximum number of tasks that can be pending at the same time
+        """
+        self._futures_list = futures_list
+        self._sleep_time = sleep_time
+        self._max_concurrent_tasks = max_concurrent_tasks
+
+    def wait_and_acquire(self) -> None:
+        while len(self._futures_list) >= self._max_concurrent_tasks:
+            time.sleep(self._sleep_time)
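
Note that Throttler deliberately holds a reference to the same list object that ThreadPoolManager mutates: pruning completed futures in _prune_futures is what eventually unblocks wait_and_acquire(). A minimal usage sketch (parameter values invented):

from concurrent.futures import Future
from typing import Any, List

from airbyte_cdk.sources.concurrent_source.throttler import Throttler

futures: List[Future[Any]] = []
throttler = Throttler(futures, sleep_time=0.1, max_concurrent_tasks=1)

throttler.wait_and_acquire()  # returns immediately: 0 pending < 1 allowed
futures.append(Future())      # simulate one pending task
# wait_and_acquire() would now sleep in 0.1s increments until the shared list shrinks.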
airbyte_cdk/sources/streams/concurrent/cursor.py
@@ -3,7 +3,8 @@
 #
 import functools
 from abc import ABC, abstractmethod
-from typing import Any, List, Mapping, Optional, Protocol, Tuple
+from datetime import datetime
+from typing import Any, List, Mapping, MutableMapping, Optional, Protocol, Tuple
 
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.message import MessageRepository
@@ -36,6 +37,11 @@ class CursorField:
 
 
 class Cursor(ABC):
+    @property
+    @abstractmethod
+    def state(self) -> MutableMapping[str, Any]:
+        ...
+
     @abstractmethod
     def observe(self, record: Record) -> None:
         """
@@ -52,6 +58,10 @@
 
 
 class NoopCursor(Cursor):
+    @property
+    def state(self) -> MutableMapping[str, Any]:
+        return {}
+
     def observe(self, record: Record) -> None:
         pass
 
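For third-party cursor implementations this is a small breaking change: every Cursor subclass must now expose its state. A hypothetical subclass (assuming observe and close_partition remain the only other abstract methods, as the surrounding context suggests):

from typing import Any, MutableMapping

class InMemoryCursor(Cursor):  # "InMemoryCursor" is a name invented for this example
    def __init__(self) -> None:
        self._state: MutableMapping[str, Any] = {}

    @property
    def state(self) -> MutableMapping[str, Any]:
        return self._state

    def observe(self, record: "Record") -> None:
        pass  # no-op for the example

    def close_partition(self, partition: "Partition") -> None:
        pass  # no-op for the example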
@@ -73,6 +83,7 @@ class ConcurrentCursor(Cursor):
         connector_state_converter: AbstractStreamStateConverter,
         cursor_field: CursorField,
         slice_boundary_fields: Optional[Tuple[str, str]],
+        start: Optional[Any],
     ) -> None:
         self._stream_name = stream_name
         self._stream_namespace = stream_namespace
@@ -82,9 +93,19 @@
         self._cursor_field = cursor_field
         # To see some example where the slice boundaries might not be defined, check https://github.com/airbytehq/airbyte/blob/1ce84d6396e446e1ac2377362446e3fb94509461/airbyte-integrations/connectors/source-stripe/source_stripe/streams.py#L363-L379
         self._slice_boundary_fields = slice_boundary_fields if slice_boundary_fields else tuple()
+        self._start = start
         self._most_recent_record: Optional[Record] = None
         self._has_closed_at_least_one_slice = False
-        self.state = stream_state
+        self.start, self._concurrent_state = self._get_concurrent_state(stream_state)
+
+    @property
+    def state(self) -> MutableMapping[str, Any]:
+        return self._concurrent_state
+
+    def _get_concurrent_state(self, state: MutableMapping[str, Any]) -> Tuple[datetime, MutableMapping[str, Any]]:
+        if self._connector_state_converter.is_state_message_compatible(state):
+            return self._start or self._connector_state_converter.zero_value, self._connector_state_converter.deserialize(state)
+        return self._connector_state_converter.convert_from_sequential_state(self._cursor_field, state, self._start)
 
     def observe(self, record: Record) -> None:
         if self._slice_boundary_fields:
@@ -102,7 +123,7 @@
     def close_partition(self, partition: Partition) -> None:
         slice_count_before = len(self.state.get("slices", []))
         self._add_slice_to_state(partition)
-        if slice_count_before < len(self.state["slices"]):
+        if slice_count_before < len(self.state["slices"]):  # only emit if at least one slice has been processed
             self._merge_partitions()
             self._emit_state_message()
         self._has_closed_at_least_one_slice = True
@@ -110,7 +131,9 @@
     def _add_slice_to_state(self, partition: Partition) -> None:
         if self._slice_boundary_fields:
             if "slices" not in self.state:
-                self.state["slices"] = []
+                raise RuntimeError(
+                    f"The state for stream {self._stream_name} should have at least one slice to delineate the sync start time, but no slices are present. This is unexpected. Please contact Support."
+                )
             self.state["slices"].append(
                 {
                     "start": self._extract_from_slice(partition, self._slice_boundary_fields[self._START_BOUNDARY]),
@@ -126,10 +149,8 @@
 
             self.state["slices"].append(
                 {
-                    # TODO: if we migrate stored state to the concurrent state format, we may want this to be the config start date
-                    # instead of zero_value.
-                    "start": self._connector_state_converter.zero_value,
-                    "end": self._extract_cursor_value(self._most_recent_record),
+                    self._connector_state_converter.START_KEY: self.start,
+                    self._connector_state_converter.END_KEY: self._extract_cursor_value(self._most_recent_record),
                 }
             )
 
airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py
@@ -2,11 +2,9 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
-from queue import Queue
-
 from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel
 from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
-from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
+from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue
 
 
 class PartitionEnqueuer:
@@ -14,10 +12,10 @@ class PartitionEnqueuer:
     Generates partitions from a partition generator and puts them in a queue.
     """
 
-    def __init__(self, queue: Queue[QueueItem]) -> None:
+    def __init__(self, queue: ThrottledQueue) -> None:
         """
         :param queue: The queue to put the partitions in.
-        :param sentinel: The sentinel to put in the queue when all the partitions have been generated.
+        :param throttler: The throttler to use to throttle the partition generation.
         """
         self._queue = queue
airbyte_cdk/sources/streams/concurrent/partition_reader.py
@@ -2,10 +2,9 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
-from queue import Queue
-
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
-from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem
+from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue
+from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel
 
 
 class PartitionReader:
  class PartitionReader:
@@ -13,7 +12,7 @@ class PartitionReader:
13
12
  Generates records from a partition and puts them in a queue.
14
13
  """
15
14
 
16
- def __init__(self, queue: Queue[QueueItem]) -> None:
15
+ def __init__(self, queue: ThrottledQueue) -> None:
17
16
  """
18
17
  :param queue: The queue to put the records in.
19
18
  """
airbyte_cdk/sources/streams/concurrent/partitions/throttled_queue.py (new file)
@@ -0,0 +1,41 @@
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+
+from queue import Queue
+
+from airbyte_cdk.sources.concurrent_source.throttler import Throttler
+from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
+
+
+class ThrottledQueue:
+    """
+    A queue that throttles the number of items that can be added to it.
+
+    We throttle the queue using custom logic instead of relying on the queue's max size
+    because the main thread can continuously dequeue before submitting a future.
+
+    Since the main thread doesn't wait, it'll be able to remove items from the queue even if the tasks should be throttled,
+    so the tasks won't wait.
+
+    This class solves this issue by checking if we should throttle the queue before adding an item to it.
+    An example implementation of a throttler would check if the number of pending futures is greater than a certain threshold.
+    """
+
+    def __init__(self, queue: Queue[QueueItem], throttler: Throttler, timeout: float) -> None:
+        """
+        :param queue: The queue to throttle
+        :param throttler: The throttler to use to throttle the queue
+        :param timeout: The timeout to use when getting items from the queue
+        """
+        self._queue = queue
+        self._throttler = throttler
+        self._timeout = timeout
+
+    def put(self, item: QueueItem) -> None:
+        self._throttler.wait_and_acquire()
+        self._queue.put(item)
+
+    def get(self) -> QueueItem:
+        return self._queue.get(block=True, timeout=self._timeout)
+
+    def empty(self) -> bool:
+        return self._queue.empty()
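
A wiring sketch of the new class (values invented; the real defaults are set by ConcurrentSource and ThreadPoolManager): put() blocks while too many futures are pending, while get() blocks only up to the timeout and then raises queue.Empty:

from queue import Queue

from airbyte_cdk.sources.concurrent_source.throttler import Throttler

futures: list = []
throttled = ThrottledQueue(
    Queue(),
    Throttler(futures, sleep_time=0.1, max_concurrent_tasks=10),
    timeout=300,
)

throttled.put("partition")  # would wait first if futures held 10+ pending entries
assert throttled.get() == "partition"
assert throttled.empty()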
airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py
@@ -4,7 +4,7 @@
 
 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import TYPE_CHECKING, Any, List, MutableMapping, Optional
+from typing import TYPE_CHECKING, Any, List, MutableMapping, Tuple
 
 if TYPE_CHECKING:
     from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
@@ -18,15 +18,6 @@ class AbstractStreamStateConverter(ABC):
     START_KEY = "start"
     END_KEY = "end"
 
-    def get_concurrent_stream_state(
-        self, cursor_field: Optional["CursorField"], state: MutableMapping[str, Any]
-    ) -> Optional[MutableMapping[str, Any]]:
-        if not cursor_field:
-            return None
-        if self.is_state_message_compatible(state):
-            return self.deserialize(state)
-        return self.convert_from_sequential_state(cursor_field, state)
-
     @abstractmethod
     def deserialize(self, state: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
         """
@@ -40,8 +31,11 @@ class AbstractStreamStateConverter(ABC):
 
     @abstractmethod
     def convert_from_sequential_state(
-        self, cursor_field: "CursorField", stream_state: MutableMapping[str, Any]
-    ) -> MutableMapping[str, Any]:
+        self,
+        cursor_field: "CursorField",
+        stream_state: MutableMapping[str, Any],
+        start: Any,
+    ) -> Tuple[Any, MutableMapping[str, Any]]:
         """
         Convert the state message to the format required by the ConcurrentCursor.
 
airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py
@@ -4,7 +4,7 @@
 
 from abc import abstractmethod
 from datetime import datetime, timedelta
-from typing import Any, List, MutableMapping, Optional
+from typing import Any, List, MutableMapping, Optional, Tuple
 
 import pendulum
 from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
@@ -16,9 +16,6 @@ from pendulum.datetime import DateTime
 
 
 class DateTimeStreamStateConverter(AbstractStreamStateConverter):
-    START_KEY = "start"
-    END_KEY = "end"
-
     @property
     @abstractmethod
     def _zero_value(self) -> Any:
@@ -62,7 +59,7 @@
         for interval in sorted_intervals[1:]:
             last_end_time = merged_intervals[-1][self.END_KEY]
             current_start_time = interval[self.START_KEY]
-            if self.compare_intervals(last_end_time, current_start_time):
+            if self._compare_intervals(last_end_time, current_start_time):
                 merged_end_time = max(last_end_time, interval[self.END_KEY])
                 merged_intervals[-1][self.END_KEY] = merged_end_time
             else:
@@ -70,10 +67,12 @@
         return merged_intervals
 
 
-    def compare_intervals(self, end_time: Any, start_time: Any) -> bool:
+    def _compare_intervals(self, end_time: Any, start_time: Any) -> bool:
         return bool(self.increment(end_time) >= start_time)
 
-    def convert_from_sequential_state(self, cursor_field: CursorField, stream_state: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
+    def convert_from_sequential_state(
+        self, cursor_field: CursorField, stream_state: MutableMapping[str, Any], start: datetime
+    ) -> Tuple[datetime, MutableMapping[str, Any]]:
         """
         Convert the state message to the format required by the ConcurrentCursor.
 
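A worked example of merge_intervals under the EpochValueConcurrentStreamStateConverter defined later in this file, assuming its one-second increment() and slices holding parsed datetimes (values invented):

import pendulum

from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    EpochValueConcurrentStreamStateConverter,
)

converter = EpochValueConcurrentStreamStateConverter()
t = pendulum.from_timestamp

merged = converter.merge_intervals(
    [
        {"start": t(0), "end": t(10)},
        {"start": t(11), "end": t(20)},  # contiguous: increment(t(10)) == t(11), so they merge
        {"start": t(25), "end": t(30)},  # gap of more than one second: kept separate
    ]
)
assert merged == [{"start": t(0), "end": t(20)}, {"start": t(25), "end": t(30)}]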
@@ -82,28 +81,35 @@
             "state_type": ConcurrencyCompatibleStateType.date_range.value,
             "metadata": { … },
             "slices": [
-                {starts: 0, end: "2021-01-18T21:18:20.000+00:00", finished_processing: true}]
+                {"start": "2021-01-18T21:18:20.000+00:00", "end": "2021-01-18T21:18:20.000+00:00"},
+            ]
         }
         """
+        sync_start = self._get_sync_start(cursor_field, stream_state, start)
         if self.is_state_message_compatible(stream_state):
-            return stream_state
-        if cursor_field.cursor_field_key in stream_state:
-            slices = [
-                {
-                    # TODO: if we migrate stored state to the concurrent state format, we may want this to be the config start date
-                    # instead of `zero_value`
-                    self.START_KEY: self.zero_value,
-                    self.END_KEY: self.parse_timestamp(stream_state[cursor_field.cursor_field_key]),
-                },
-            ]
-        else:
-            slices = []
-        return {
+            return sync_start, stream_state
+
+        # Create a slice to represent the records synced during prior syncs.
+        # The start and end are the same to avoid confusion as to whether the records for this slice
+        # were actually synced
+        slices = [{self.START_KEY: sync_start, self.END_KEY: sync_start}]
+
+        return sync_start, {
             "state_type": ConcurrencyCompatibleStateType.date_range.value,
             "slices": slices,
             "legacy": stream_state,
         }
 
+    def _get_sync_start(self, cursor_field: CursorField, stream_state: MutableMapping[str, Any], start: Optional[Any]) -> datetime:
+        sync_start = self.parse_timestamp(start) if start is not None else self.zero_value
+        prev_sync_low_water_mark = (
+            self.parse_timestamp(stream_state[cursor_field.cursor_field_key]) if cursor_field.cursor_field_key in stream_state else None
+        )
+        if prev_sync_low_water_mark and prev_sync_low_water_mark >= sync_start:
+            return prev_sync_low_water_mark
+        else:
+            return sync_start
+
     def convert_to_sequential_state(self, cursor_field: CursorField, stream_state: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
         """
         Convert the state message from the concurrency-compatible format to the stream's original format.
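
A worked example of the new migration path (values invented; import paths inferred from the file list above): the previous sync's cursor value wins over the configured start when it is further along, and the converted state seeds a single [sync_start, sync_start] slice:

from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import (
    ConcurrencyCompatibleStateType,
)
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    EpochValueConcurrentStreamStateConverter,
)

converter = EpochValueConcurrentStreamStateConverter()
sync_start, state = converter.convert_from_sequential_state(
    CursorField("created"), {"created": 1_617_030_403}, start=0
)

assert sync_start == converter.parse_timestamp(1_617_030_403)  # the low-water mark wins over start=0
assert state["state_type"] == ConcurrencyCompatibleStateType.date_range.value
assert state["slices"] == [{"start": sync_start, "end": sync_start}]
assert state["legacy"] == {"created": 1_617_030_403}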
@@ -113,10 +119,9 @@
         """
         if self.is_state_message_compatible(stream_state):
             legacy_state = stream_state.get("legacy", {})
-            if slices := stream_state.pop("slices", None):
-                latest_complete_time = self._get_latest_complete_time(slices)
-                if latest_complete_time:
-                    legacy_state.update({cursor_field.cursor_field_key: self.output_format(latest_complete_time)})
+            latest_complete_time = self._get_latest_complete_time(stream_state.get("slices", []))
+            if latest_complete_time is not None:
+                legacy_state.update({cursor_field.cursor_field_key: self.output_format(latest_complete_time)})
             return legacy_state or {}
         else:
             return stream_state
@@ -125,11 +130,12 @@
         """
         Get the latest time before which all records have been processed.
         """
-        if slices:
-            first_interval = self.merge_intervals(slices)[0][self.END_KEY]
-            return first_interval
-        else:
-            return None
+        if not slices:
+            raise RuntimeError("Expected at least one slice but there were none. This is unexpected; please contact Support.")
+
+        merged_intervals = self.merge_intervals(slices)
+        first_interval = merged_intervals[0]
+        return first_interval[self.END_KEY]
 
 
 class EpochValueConcurrentStreamStateConverter(DateTimeStreamStateConverter):
{airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.58.8
+Version: 0.59.0
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte