airbyte-cdk 0.58.8__py3-none-any.whl → 0.59.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (23)
  1. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +20 -21
  2. airbyte_cdk/sources/concurrent_source/concurrent_source.py +4 -3
  3. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +15 -18
  4. airbyte_cdk/sources/concurrent_source/throttler.py +25 -0
  5. airbyte_cdk/sources/streams/concurrent/cursor.py +29 -8
  6. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +3 -5
  7. airbyte_cdk/sources/streams/concurrent/partition_reader.py +3 -4
  8. airbyte_cdk/sources/streams/concurrent/partitions/throttled_queue.py +41 -0
  9. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +6 -12
  10. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +36 -30
  11. {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/METADATA +1 -1
  12. {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/RECORD +23 -19
  13. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +2 -2
  14. unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py +4 -10
  15. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +82 -12
  16. unit_tests/sources/streams/concurrent/test_cursor.py +20 -3
  17. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +166 -268
  18. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +2 -15
  19. unit_tests/sources/streams/concurrent/test_throttled_queue.py +65 -0
  20. unit_tests/sources/streams/concurrent/test_throttler.py +13 -0
  21. {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/LICENSE.txt +0 -0
  22. {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/WHEEL +0 -0
  23. {airbyte_cdk-0.58.8.dist-info → airbyte_cdk-0.59.0.dist-info}/top_level.txt +0 -0
@@ -43,9 +43,9 @@ class ConcurrentReadProcessor:
43
43
  """
44
44
  self._stream_name_to_instance = {s.name: s for s in stream_instances_to_read_from}
45
45
  self._record_counter = {}
46
- self._streams_to_partitions: Dict[str, Set[Partition]] = {}
46
+ self._streams_to_running_partitions: Dict[str, Set[Partition]] = {}
47
47
  for stream in stream_instances_to_read_from:
48
- self._streams_to_partitions[stream.name] = set()
48
+ self._streams_to_running_partitions[stream.name] = set()
49
49
  self._record_counter[stream.name] = 0
50
50
  self._thread_pool_manager = thread_pool_manager
51
51
  self._partition_enqueuer = partition_enqueuer
@@ -55,6 +55,7 @@ class ConcurrentReadProcessor:
55
55
  self._slice_logger = slice_logger
56
56
  self._message_repository = message_repository
57
57
  self._partition_reader = partition_reader
58
+ self._streams_done: Set[str] = set()
58
59
 
59
60
  def on_partition_generation_completed(self, sentinel: PartitionGenerationCompletedSentinel) -> Iterable[AirbyteMessage]:
60
61
  """
@@ -67,7 +68,8 @@ class ConcurrentReadProcessor:
67
68
  self._streams_currently_generating_partitions.remove(sentinel.stream.name)
68
69
  ret = []
69
70
  # It is possible for the stream to already be done if no partitions were generated
70
- if self._is_stream_done(stream_name):
71
+ # If the partition generation process was completed and there are no partitions left to process, the stream is done
72
+ if self._is_stream_done(stream_name) or len(self._streams_to_running_partitions[stream_name]) == 0:
71
73
  ret.append(self._on_stream_is_done(stream_name))
72
74
  if self._stream_instances_to_start_partition_generation:
73
75
  ret.append(self.start_next_partition_generator())
@@ -81,7 +83,7 @@ class ConcurrentReadProcessor:
81
83
  3. Submit the partition to the thread pool manager
82
84
  """
83
85
  stream_name = partition.stream_name()
84
- self._streams_to_partitions[stream_name].add(partition)
86
+ self._streams_to_running_partitions[stream_name].add(partition)
85
87
  if self._slice_logger.should_log_slice_message(self._logger):
86
88
  self._message_repository.emit_message(self._slice_logger.create_slice_log_message(partition.to_slice()))
87
89
  self._thread_pool_manager.submit(self._partition_reader.process_partition, partition)
@@ -95,8 +97,12 @@ class ConcurrentReadProcessor:
95
97
  """
96
98
  partition = sentinel.partition
97
99
  partition.close()
98
- if self._is_stream_done(partition.stream_name()):
99
- yield self._on_stream_is_done(partition.stream_name())
100
+ partitions_running = self._streams_to_running_partitions[partition.stream_name()]
101
+ if partition in partitions_running:
102
+ partitions_running.remove(partition)
103
+ # If all partitions were generated and this was the last one, the stream is done
104
+ if partition.stream_name() not in self._streams_currently_generating_partitions and len(partitions_running) == 0:
105
+ yield self._on_stream_is_done(partition.stream_name())
100
106
  yield from self._message_repository.consume_queue()
101
107
 
102
108
  def on_record(self, record: Record) -> Iterable[AirbyteMessage]:
@@ -114,11 +120,10 @@ class ConcurrentReadProcessor:
114
120
  message = stream_data_to_airbyte_message(record.stream_name, record.data)
115
121
  stream = self._stream_name_to_instance[record.stream_name]
116
122
 
117
- if self._record_counter[stream.name] == 0:
118
- self._logger.info(f"Marking stream {stream.name} as RUNNING")
119
- yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING)
120
-
121
123
  if message.type == MessageType.RECORD:
124
+ if self._record_counter[stream.name] == 0:
125
+ self._logger.info(f"Marking stream {stream.name} as RUNNING")
126
+ yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING)
122
127
  self._record_counter[stream.name] += 1
123
128
  yield message
124
129
  yield from self._message_repository.consume_queue()
@@ -161,30 +166,24 @@ class ConcurrentReadProcessor:
161
166
  2. There are no more streams to read from
162
167
  3. All partitions for all streams are closed
163
168
  """
164
- return (
165
- not self._streams_currently_generating_partitions
166
- and not self._stream_instances_to_start_partition_generation
167
- and all([all(p.is_closed() for p in partitions) for partitions in self._streams_to_partitions.values()])
168
- )
169
+ return all([self._is_stream_done(stream_name) for stream_name in self._stream_name_to_instance.keys()])
169
170
 
170
171
  def _is_stream_done(self, stream_name: str) -> bool:
171
- return (
172
- all([p.is_closed() for p in self._streams_to_partitions[stream_name]])
173
- and stream_name not in self._streams_currently_generating_partitions
174
- )
172
+ return stream_name in self._streams_done
175
173
 
176
174
  def _on_stream_is_done(self, stream_name: str) -> AirbyteMessage:
177
175
  self._logger.info(f"Read {self._record_counter[stream_name]} records from {stream_name} stream")
178
176
  self._logger.info(f"Marking stream {stream_name} as STOPPED")
179
177
  stream = self._stream_name_to_instance[stream_name]
180
178
  self._logger.info(f"Finished syncing {stream.name}")
179
+ self._streams_done.add(stream_name)
181
180
  return stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.COMPLETE)
182
181
 
183
182
  def _stop_streams(self) -> Iterable[AirbyteMessage]:
184
183
  self._thread_pool_manager.shutdown()
185
- for stream_name, partitions in self._streams_to_partitions.items():
184
+ for stream_name in self._streams_to_running_partitions.keys():
186
185
  stream = self._stream_name_to_instance[stream_name]
187
- if not all([p.is_closed() for p in partitions]):
186
+ if not self._is_stream_done(stream_name):
188
187
  self._logger.info(f"Marking stream {stream.name} as STOPPED")
189
188
  self._logger.info(f"Finished syncing {stream.name}")
190
189
  yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.INCOMPLETE)
@@ -16,6 +16,7 @@ from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionE
16
16
  from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
17
17
  from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
18
18
  from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
19
+ from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue
19
20
  from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem
20
21
  from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
21
22
 
@@ -82,7 +83,7 @@ class ConcurrentSource:
82
83
  if not stream_instances_to_read_from:
83
84
  return
84
85
 
85
- queue: Queue[QueueItem] = Queue()
86
+ queue: ThrottledQueue = ThrottledQueue(Queue(), self._threadpool.get_throttler(), self._timeout_seconds)
86
87
  concurrent_stream_processor = ConcurrentReadProcessor(
87
88
  stream_instances_to_read_from,
88
89
  PartitionEnqueuer(queue),
@@ -112,10 +113,10 @@ class ConcurrentSource:
112
113
 
113
114
  def _consume_from_queue(
114
115
  self,
115
- queue: Queue[QueueItem],
116
+ queue: ThrottledQueue,
116
117
  concurrent_stream_processor: ConcurrentReadProcessor,
117
118
  ) -> Iterable[AirbyteMessage]:
118
- while airbyte_message_or_record_or_exception := queue.get(block=True, timeout=self._timeout_seconds):
119
+ while airbyte_message_or_record_or_exception := queue.get():
119
120
  yield from self._handle_item(
120
121
  airbyte_message_or_record_or_exception,
121
122
  concurrent_stream_processor,
@@ -2,10 +2,11 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
  import logging
5
- import time
6
5
  from concurrent.futures import Future, ThreadPoolExecutor
7
6
  from typing import Any, Callable, List
8
7
 
8
+ from airbyte_cdk.sources.concurrent_source.throttler import Throttler
9
+
9
10
 
10
11
  class ThreadPoolManager:
11
12
  """
@@ -19,8 +20,8 @@ class ThreadPoolManager:
19
20
  self,
20
21
  threadpool: ThreadPoolExecutor,
21
22
  logger: logging.Logger,
22
- max_concurrent_tasks: int = DEFAULT_MAX_QUEUE_SIZE,
23
23
  sleep_time: float = DEFAULT_SLEEP_TIME,
24
+ max_concurrent_tasks: int = DEFAULT_MAX_QUEUE_SIZE,
24
25
  ):
25
26
  """
26
27
  :param threadpool: The threadpool to use
@@ -31,23 +32,17 @@ class ThreadPoolManager:
31
32
  self._threadpool = threadpool
32
33
  self._logger = logger
33
34
  self._max_concurrent_tasks = max_concurrent_tasks
34
- self._sleep_time = sleep_time
35
35
  self._futures: List[Future[Any]] = []
36
+ self._throttler = Throttler(self._futures, sleep_time, max_concurrent_tasks)
37
+
38
+ def get_throttler(self) -> Throttler:
39
+ return self._throttler
36
40
 
37
41
  def submit(self, function: Callable[..., Any], *args: Any) -> None:
38
- # Submit a task to the threadpool, waiting if there are too many pending tasks
39
- self._wait_while_too_many_pending_futures(self._futures)
42
+ # Submit a task to the threadpool, removing completed tasks if there are too many tasks in self._futures.
43
+ self._prune_futures(self._futures)
40
44
  self._futures.append(self._threadpool.submit(function, *args))
41
45
 
42
- def _wait_while_too_many_pending_futures(self, futures: List[Future[Any]]) -> None:
43
- # Wait until the number of pending tasks is < self._max_concurrent_tasks
44
- while True:
45
- self._prune_futures(futures)
46
- if len(futures) < self._max_concurrent_tasks:
47
- break
48
- self._logger.info("Main thread is sleeping because the task queue is full...")
49
- time.sleep(self._sleep_time)
50
-
51
46
  def _prune_futures(self, futures: List[Future[Any]]) -> None:
52
47
  """
53
48
  Take a list in input and remove the futures that are completed. If a future has an exception, it'll raise and kill the stream
@@ -60,12 +55,14 @@ class ThreadPoolManager:
60
55
 
61
56
  for index in reversed(range(len(futures))):
62
57
  future = futures[index]
63
- optional_exception = future.exception()
64
- if optional_exception:
65
- exception = RuntimeError(f"Failed reading with error: {optional_exception}")
66
- self._stop_and_raise_exception(exception)
67
58
 
68
59
  if future.done():
60
+ # Only call future.exception() if the future is known to be done because it will block until the future is done.
61
+ # See https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.Future.exception
62
+ optional_exception = future.exception()
63
+ if optional_exception:
64
+ exception = RuntimeError(f"Failed reading with error: {optional_exception}")
65
+ self._stop_and_raise_exception(exception)
69
66
  futures.pop(index)
70
67
 
71
68
  def shutdown(self) -> None:
@@ -0,0 +1,25 @@
1
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+
3
+ import time
4
+ from concurrent.futures import Future
5
+ from typing import Any, List
6
+
7
+
8
+ class Throttler:
9
+ """
10
+ A throttler that waits until the number of concurrent tasks is below a certain threshold.
11
+ """
12
+
13
+ def __init__(self, futures_list: List[Future[Any]], sleep_time: float, max_concurrent_tasks: int):
14
+ """
15
+ :param futures_list: The list of futures to monitor
16
+ :param sleep_time: How long to sleep if there are too many pending tasks
17
+ :param max_concurrent_tasks: The maximum number of tasks that can be pending at the same time
18
+ """
19
+ self._futures_list = futures_list
20
+ self._sleep_time = sleep_time
21
+ self._max_concurrent_tasks = max_concurrent_tasks
22
+
23
+ def wait_and_acquire(self) -> None:
24
+ while len(self._futures_list) >= self._max_concurrent_tasks:
25
+ time.sleep(self._sleep_time)
@@ -3,7 +3,8 @@
3
3
  #
4
4
  import functools
5
5
  from abc import ABC, abstractmethod
6
- from typing import Any, List, Mapping, Optional, Protocol, Tuple
6
+ from datetime import datetime
7
+ from typing import Any, List, Mapping, MutableMapping, Optional, Protocol, Tuple
7
8
 
8
9
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
9
10
  from airbyte_cdk.sources.message import MessageRepository
@@ -36,6 +37,11 @@ class CursorField:
36
37
 
37
38
 
38
39
  class Cursor(ABC):
40
+ @property
41
+ @abstractmethod
42
+ def state(self) -> MutableMapping[str, Any]:
43
+ ...
44
+
39
45
  @abstractmethod
40
46
  def observe(self, record: Record) -> None:
41
47
  """
@@ -52,6 +58,10 @@ class Cursor(ABC):
52
58
 
53
59
 
54
60
  class NoopCursor(Cursor):
61
+ @property
62
+ def state(self) -> MutableMapping[str, Any]:
63
+ return {}
64
+
55
65
  def observe(self, record: Record) -> None:
56
66
  pass
57
67
 
@@ -73,6 +83,7 @@ class ConcurrentCursor(Cursor):
73
83
  connector_state_converter: AbstractStreamStateConverter,
74
84
  cursor_field: CursorField,
75
85
  slice_boundary_fields: Optional[Tuple[str, str]],
86
+ start: Optional[Any],
76
87
  ) -> None:
77
88
  self._stream_name = stream_name
78
89
  self._stream_namespace = stream_namespace
@@ -82,9 +93,19 @@ class ConcurrentCursor(Cursor):
82
93
  self._cursor_field = cursor_field
83
94
  # To see some example where the slice boundaries might not be defined, check https://github.com/airbytehq/airbyte/blob/1ce84d6396e446e1ac2377362446e3fb94509461/airbyte-integrations/connectors/source-stripe/source_stripe/streams.py#L363-L379
84
95
  self._slice_boundary_fields = slice_boundary_fields if slice_boundary_fields else tuple()
96
+ self._start = start
85
97
  self._most_recent_record: Optional[Record] = None
86
98
  self._has_closed_at_least_one_slice = False
87
- self.state = stream_state
99
+ self.start, self._concurrent_state = self._get_concurrent_state(stream_state)
100
+
101
+ @property
102
+ def state(self) -> MutableMapping[str, Any]:
103
+ return self._concurrent_state
104
+
105
+ def _get_concurrent_state(self, state: MutableMapping[str, Any]) -> Tuple[datetime, MutableMapping[str, Any]]:
106
+ if self._connector_state_converter.is_state_message_compatible(state):
107
+ return self._start or self._connector_state_converter.zero_value, self._connector_state_converter.deserialize(state)
108
+ return self._connector_state_converter.convert_from_sequential_state(self._cursor_field, state, self._start)
88
109
 
89
110
  def observe(self, record: Record) -> None:
90
111
  if self._slice_boundary_fields:
@@ -102,7 +123,7 @@ class ConcurrentCursor(Cursor):
102
123
  def close_partition(self, partition: Partition) -> None:
103
124
  slice_count_before = len(self.state.get("slices", []))
104
125
  self._add_slice_to_state(partition)
105
- if slice_count_before < len(self.state["slices"]):
126
+ if slice_count_before < len(self.state["slices"]): # only emit if at least one slice has been processed
106
127
  self._merge_partitions()
107
128
  self._emit_state_message()
108
129
  self._has_closed_at_least_one_slice = True
@@ -110,7 +131,9 @@ class ConcurrentCursor(Cursor):
110
131
  def _add_slice_to_state(self, partition: Partition) -> None:
111
132
  if self._slice_boundary_fields:
112
133
  if "slices" not in self.state:
113
- self.state["slices"] = []
134
+ raise RuntimeError(
135
+ f"The state for stream {self._stream_name} should have at least one slice to delineate the sync start time, but no slices are present. This is unexpected. Please contact Support."
136
+ )
114
137
  self.state["slices"].append(
115
138
  {
116
139
  "start": self._extract_from_slice(partition, self._slice_boundary_fields[self._START_BOUNDARY]),
@@ -126,10 +149,8 @@ class ConcurrentCursor(Cursor):
126
149
 
127
150
  self.state["slices"].append(
128
151
  {
129
- # TODO: if we migrate stored state to the concurrent state format, we may want this to be the config start date
130
- # instead of zero_value.
131
- "start": self._connector_state_converter.zero_value,
132
- "end": self._extract_cursor_value(self._most_recent_record),
152
+ self._connector_state_converter.START_KEY: self.start,
153
+ self._connector_state_converter.END_KEY: self._extract_cursor_value(self._most_recent_record),
133
154
  }
134
155
  )
135
156
 
@@ -2,11 +2,9 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
- from queue import Queue
6
-
7
5
  from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel
8
6
  from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
9
- from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
7
+ from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue
10
8
 
11
9
 
12
10
  class PartitionEnqueuer:
@@ -14,10 +12,10 @@ class PartitionEnqueuer:
14
12
  Generates partitions from a partition generator and puts them in a queue.
15
13
  """
16
14
 
17
- def __init__(self, queue: Queue[QueueItem]) -> None:
15
+ def __init__(self, queue: ThrottledQueue) -> None:
18
16
  """
19
17
  :param queue: The queue to put the partitions in.
20
- :param sentinel: The sentinel to put in the queue when all the partitions have been generated.
18
+ :param throttler: The throttler to use to throttle the partition generation.
21
19
  """
22
20
  self._queue = queue
23
21
 
@@ -2,10 +2,9 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
- from queue import Queue
6
-
7
5
  from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
8
- from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem
6
+ from airbyte_cdk.sources.streams.concurrent.partitions.throttled_queue import ThrottledQueue
7
+ from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel
9
8
 
10
9
 
11
10
  class PartitionReader:
@@ -13,7 +12,7 @@ class PartitionReader:
13
12
  Generates records from a partition and puts them in a queue.
14
13
  """
15
14
 
16
- def __init__(self, queue: Queue[QueueItem]) -> None:
15
+ def __init__(self, queue: ThrottledQueue) -> None:
17
16
  """
18
17
  :param queue: The queue to put the records in.
19
18
  """
@@ -0,0 +1,41 @@
1
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
2
+
3
+ from queue import Queue
4
+
5
+ from airbyte_cdk.sources.concurrent_source.throttler import Throttler
6
+ from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
7
+
8
+
9
+ class ThrottledQueue:
10
+ """
11
+ A queue that throttles the number of items that can be added to it.
12
+
13
+ We throttle the queue using custom logic instead of relying on the queue's max size
14
+ because the main thread can continuously dequeue before submitting a future.
15
+
16
+ Since the main thread doesn't wait, it'll be able to remove items from the queue even if the tasks should be throttled,
17
+ so the tasks won't wait.
18
+
19
+ This class solves this issue by checking if we should throttle the queue before adding an item to it.
20
+ An example implementation of a throttler would check if the number of pending futures is greater than a certain threshold.
21
+ """
22
+
23
+ def __init__(self, queue: Queue[QueueItem], throttler: Throttler, timeout: float) -> None:
24
+ """
25
+ :param queue: The queue to throttle
26
+ :param throttler: The throttler to use to throttle the queue
27
+ :param timeout: The timeout to use when getting items from the queue
28
+ """
29
+ self._queue = queue
30
+ self._throttler = throttler
31
+ self._timeout = timeout
32
+
33
+ def put(self, item: QueueItem) -> None:
34
+ self._throttler.wait_and_acquire()
35
+ self._queue.put(item)
36
+
37
+ def get(self) -> QueueItem:
38
+ return self._queue.get(block=True, timeout=self._timeout)
39
+
40
+ def empty(self) -> bool:
41
+ return self._queue.empty()
@@ -4,7 +4,7 @@
4
4
 
5
5
  from abc import ABC, abstractmethod
6
6
  from enum import Enum
7
- from typing import TYPE_CHECKING, Any, List, MutableMapping, Optional
7
+ from typing import TYPE_CHECKING, Any, List, MutableMapping, Tuple
8
8
 
9
9
  if TYPE_CHECKING:
10
10
  from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
@@ -18,15 +18,6 @@ class AbstractStreamStateConverter(ABC):
18
18
  START_KEY = "start"
19
19
  END_KEY = "end"
20
20
 
21
- def get_concurrent_stream_state(
22
- self, cursor_field: Optional["CursorField"], state: MutableMapping[str, Any]
23
- ) -> Optional[MutableMapping[str, Any]]:
24
- if not cursor_field:
25
- return None
26
- if self.is_state_message_compatible(state):
27
- return self.deserialize(state)
28
- return self.convert_from_sequential_state(cursor_field, state)
29
-
30
21
  @abstractmethod
31
22
  def deserialize(self, state: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
32
23
  """
@@ -40,8 +31,11 @@ class AbstractStreamStateConverter(ABC):
40
31
 
41
32
  @abstractmethod
42
33
  def convert_from_sequential_state(
43
- self, cursor_field: "CursorField", stream_state: MutableMapping[str, Any]
44
- ) -> MutableMapping[str, Any]:
34
+ self,
35
+ cursor_field: "CursorField",
36
+ stream_state: MutableMapping[str, Any],
37
+ start: Any,
38
+ ) -> Tuple[Any, MutableMapping[str, Any]]:
45
39
  """
46
40
  Convert the state message to the format required by the ConcurrentCursor.
47
41
 
@@ -4,7 +4,7 @@
4
4
 
5
5
  from abc import abstractmethod
6
6
  from datetime import datetime, timedelta
7
- from typing import Any, List, MutableMapping, Optional
7
+ from typing import Any, List, MutableMapping, Optional, Tuple
8
8
 
9
9
  import pendulum
10
10
  from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
@@ -16,9 +16,6 @@ from pendulum.datetime import DateTime
16
16
 
17
17
 
18
18
  class DateTimeStreamStateConverter(AbstractStreamStateConverter):
19
- START_KEY = "start"
20
- END_KEY = "end"
21
-
22
19
  @property
23
20
  @abstractmethod
24
21
  def _zero_value(self) -> Any:
@@ -62,7 +59,7 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
62
59
  for interval in sorted_intervals[1:]:
63
60
  last_end_time = merged_intervals[-1][self.END_KEY]
64
61
  current_start_time = interval[self.START_KEY]
65
- if self.compare_intervals(last_end_time, current_start_time):
62
+ if self._compare_intervals(last_end_time, current_start_time):
66
63
  merged_end_time = max(last_end_time, interval[self.END_KEY])
67
64
  merged_intervals[-1][self.END_KEY] = merged_end_time
68
65
  else:
@@ -70,10 +67,12 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
70
67
 
71
68
  return merged_intervals
72
69
 
73
- def compare_intervals(self, end_time: Any, start_time: Any) -> bool:
70
+ def _compare_intervals(self, end_time: Any, start_time: Any) -> bool:
74
71
  return bool(self.increment(end_time) >= start_time)
75
72
 
76
- def convert_from_sequential_state(self, cursor_field: CursorField, stream_state: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
73
+ def convert_from_sequential_state(
74
+ self, cursor_field: CursorField, stream_state: MutableMapping[str, Any], start: datetime
75
+ ) -> Tuple[datetime, MutableMapping[str, Any]]:
77
76
  """
78
77
  Convert the state message to the format required by the ConcurrentCursor.
79
78
 
@@ -82,28 +81,35 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
82
81
  "state_type": ConcurrencyCompatibleStateType.date_range.value,
83
82
  "metadata": { … },
84
83
  "slices": [
85
- {starts: 0, end: "2021-01-18T21:18:20.000+00:00", finished_processing: true}]
84
+ {"start": "2021-01-18T21:18:20.000+00:00", "end": "2021-01-18T21:18:20.000+00:00"},
85
+ ]
86
86
  }
87
87
  """
88
+ sync_start = self._get_sync_start(cursor_field, stream_state, start)
88
89
  if self.is_state_message_compatible(stream_state):
89
- return stream_state
90
- if cursor_field.cursor_field_key in stream_state:
91
- slices = [
92
- {
93
- # TODO: if we migrate stored state to the concurrent state format, we may want this to be the config start date
94
- # instead of `zero_value`
95
- self.START_KEY: self.zero_value,
96
- self.END_KEY: self.parse_timestamp(stream_state[cursor_field.cursor_field_key]),
97
- },
98
- ]
99
- else:
100
- slices = []
101
- return {
90
+ return sync_start, stream_state
91
+
92
+ # Create a slice to represent the records synced during prior syncs.
93
+ # The start and end are the same to avoid confusion as to whether the records for this slice
94
+ # were actually synced
95
+ slices = [{self.START_KEY: sync_start, self.END_KEY: sync_start}]
96
+
97
+ return sync_start, {
102
98
  "state_type": ConcurrencyCompatibleStateType.date_range.value,
103
99
  "slices": slices,
104
100
  "legacy": stream_state,
105
101
  }
106
102
 
103
+ def _get_sync_start(self, cursor_field: CursorField, stream_state: MutableMapping[str, Any], start: Optional[Any]) -> datetime:
104
+ sync_start = self.parse_timestamp(start) if start is not None else self.zero_value
105
+ prev_sync_low_water_mark = (
106
+ self.parse_timestamp(stream_state[cursor_field.cursor_field_key]) if cursor_field.cursor_field_key in stream_state else None
107
+ )
108
+ if prev_sync_low_water_mark and prev_sync_low_water_mark >= sync_start:
109
+ return prev_sync_low_water_mark
110
+ else:
111
+ return sync_start
112
+
107
113
  def convert_to_sequential_state(self, cursor_field: CursorField, stream_state: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
108
114
  """
109
115
  Convert the state message from the concurrency-compatible format to the stream's original format.
@@ -113,10 +119,9 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
113
119
  """
114
120
  if self.is_state_message_compatible(stream_state):
115
121
  legacy_state = stream_state.get("legacy", {})
116
- if slices := stream_state.pop("slices", None):
117
- latest_complete_time = self._get_latest_complete_time(slices)
118
- if latest_complete_time:
119
- legacy_state.update({cursor_field.cursor_field_key: self.output_format(latest_complete_time)})
122
+ latest_complete_time = self._get_latest_complete_time(stream_state.get("slices", []))
123
+ if latest_complete_time is not None:
124
+ legacy_state.update({cursor_field.cursor_field_key: self.output_format(latest_complete_time)})
120
125
  return legacy_state or {}
121
126
  else:
122
127
  return stream_state
@@ -125,11 +130,12 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
125
130
  """
126
131
  Get the latest time before which all records have been processed.
127
132
  """
128
- if slices:
129
- first_interval = self.merge_intervals(slices)[0][self.END_KEY]
130
- return first_interval
131
- else:
132
- return None
133
+ if not slices:
134
+ raise RuntimeError("Expected at least one slice but there were none. This is unexpected; please contact Support.")
135
+
136
+ merged_intervals = self.merge_intervals(slices)
137
+ first_interval = merged_intervals[0]
138
+ return first_interval[self.END_KEY]
133
139
 
134
140
 
135
141
  class EpochValueConcurrentStreamStateConverter(DateTimeStreamStateConverter):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-cdk
3
- Version: 0.58.8
3
+ Version: 0.59.0
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  Home-page: https://github.com/airbytehq/airbyte
6
6
  Author: Airbyte