airbyte-cdk 0.67.1__py3-none-any.whl → 0.67.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. airbyte_cdk/sources/abstract_source.py +30 -69
  2. airbyte_cdk/sources/connector_state_manager.py +12 -26
  3. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
  4. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
  5. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
  6. airbyte_cdk/sources/streams/__init__.py +2 -2
  7. airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
  8. airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
  9. airbyte_cdk/sources/streams/core.py +36 -34
  10. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA +1 -1
  11. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/RECORD +28 -28
  12. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
  13. unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
  14. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
  15. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
  16. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
  17. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
  18. unit_tests/sources/file_based/test_scenarios.py +2 -2
  19. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
  20. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
  21. unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
  22. unit_tests/sources/streams/test_stream_read.py +221 -11
  23. unit_tests/sources/test_abstract_source.py +142 -130
  24. unit_tests/sources/test_connector_state_manager.py +3 -124
  25. unit_tests/sources/test_source.py +18 -14
  26. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/LICENSE.txt +0 -0
  27. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/WHEEL +0 -0
  28. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/top_level.txt +0 -0

airbyte_cdk/sources/abstract_source.py

@@ -23,7 +23,7 @@ from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
 from airbyte_cdk.sources.source import Source
-from airbyte_cdk.sources.streams import Stream
+from airbyte_cdk.sources.streams import FULL_REFRESH_SENTINEL_STATE_KEY, Stream
 from airbyte_cdk.sources.streams.core import StreamData
 from airbyte_cdk.sources.streams.http.http import HttpStream
 from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
@@ -181,10 +181,6 @@ class AbstractSource(Source, ABC):
     def raise_exception_on_missing_stream(self) -> bool:
         return True
 
-    @property
-    def per_stream_state_enabled(self) -> bool:
-        return True
-
     def _read_stream(
         self,
         logger: logging.Logger,
@@ -206,22 +202,32 @@ class AbstractSource(Source, ABC):
         )
         stream_instance.log_stream_sync_configuration()
 
-        use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental
-        if use_incremental:
-            record_iterator = self._read_incremental(
-                logger,
-                stream_instance,
-                configured_stream,
-                state_manager,
-                internal_config,
-            )
-        else:
-            record_iterator = self._read_full_refresh(logger, stream_instance, configured_stream, internal_config)
+        stream_name = configured_stream.stream.name
+        # The platform always passes stream state regardless of sync mode. We shouldn't need to consider this case within the
+        # connector, but right now we need to prevent accidental usage of the previous stream state
+        stream_state = (
+            state_manager.get_stream_state(stream_name, stream_instance.namespace)
+            if configured_stream.sync_mode == SyncMode.incremental
+            else {}
+        )
+
+        if stream_state and "state" in dir(stream_instance) and not self._stream_state_is_full_refresh(stream_state):
+            stream_instance.state = stream_state  # type: ignore  # we check that state in the dir(stream_instance)
+            logger.info(f"Setting state of {self.name} stream to {stream_state}")
+
+        record_iterator = stream_instance.read(
+            configured_stream,
+            logger,
+            self._slice_logger,
+            stream_state,
+            state_manager,
+            internal_config,
+        )
 
         record_counter = 0
-        stream_name = configured_stream.stream.name
         logger.info(f"Syncing stream: {stream_name} ")
-        for record in record_iterator:
+        for record_data_or_message in record_iterator:
+            record = self._get_message(record_data_or_message, stream_instance)
             if record.type == MessageType.RECORD:
                 record_counter += 1
                 if record_counter == 1:
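
The comment in the hunk above captures the key guard: the platform sends state for every sync, but full refresh must not consume it. A tiny illustrative sketch of that gating (the names and saved state are made-up):

    from airbyte_cdk.models import SyncMode

    saved_state = {"created_at": "2024-01-01"}  # hypothetical prior state handed over by the platform

    for mode in (SyncMode.incremental, SyncMode.full_refresh):
        # Mirrors the new _read_stream gating: only incremental syncs consume prior state
        stream_state = saved_state if mode == SyncMode.incremental else {}
        print(mode.value, "->", stream_state)
    # incremental -> {'created_at': '2024-01-01'}
    # full_refresh -> {}
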
@@ -233,62 +239,11 @@ class AbstractSource(Source, ABC):
 
         logger.info(f"Read {record_counter} records from {stream_name} stream")
 
-    def _read_incremental(
-        self,
-        logger: logging.Logger,
-        stream_instance: Stream,
-        configured_stream: ConfiguredAirbyteStream,
-        state_manager: ConnectorStateManager,
-        internal_config: InternalConfig,
-    ) -> Iterator[AirbyteMessage]:
-        """Read stream using incremental algorithm
-
-        :param logger:
-        :param stream_instance:
-        :param configured_stream:
-        :param state_manager:
-        :param internal_config:
-        :return:
-        """
-        stream_name = configured_stream.stream.name
-        stream_state = state_manager.get_stream_state(stream_name, stream_instance.namespace)
-
-        if stream_state and "state" in dir(stream_instance):
-            stream_instance.state = stream_state  # type: ignore  # we check that state in the dir(stream_instance)
-            logger.info(f"Setting state of {self.name} stream to {stream_state}")
-
-        for record_data_or_message in stream_instance.read_incremental(
-            configured_stream.cursor_field,
-            logger,
-            self._slice_logger,
-            stream_state,
-            state_manager,
-            self.per_stream_state_enabled,
-            internal_config,
-        ):
-            yield self._get_message(record_data_or_message, stream_instance)
-
     def _emit_queued_messages(self) -> Iterable[AirbyteMessage]:
         if self.message_repository:
             yield from self.message_repository.consume_queue()
         return
 
-    def _read_full_refresh(
-        self,
-        logger: logging.Logger,
-        stream_instance: Stream,
-        configured_stream: ConfiguredAirbyteStream,
-        internal_config: InternalConfig,
-    ) -> Iterator[AirbyteMessage]:
-        total_records_counter = 0
-        for record_data_or_message in stream_instance.read_full_refresh(configured_stream.cursor_field, logger, self._slice_logger):
-            message = self._get_message(record_data_or_message, stream_instance)
-            yield message
-            if message.type == MessageType.RECORD:
-                total_records_counter += 1
-                if internal_config.is_limit_reached(total_records_counter):
-                    return
-
     def _get_message(self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream) -> AirbyteMessage:
         """
         Converts the input to an AirbyteMessage if it is a StreamData. Returns the input as is if it is already an AirbyteMessage
@@ -317,3 +272,9 @@ class AbstractSource(Source, ABC):
     def _generate_failed_streams_error_message(stream_failures: Mapping[str, AirbyteTracedException]) -> str:
         failures = ", ".join([f"{stream}: {filter_secrets(exception.__repr__())}" for stream, exception in stream_failures.items()])
         return f"During the sync, the following streams did not sync successfully: {failures}"
+
+    @staticmethod
+    def _stream_state_is_full_refresh(stream_state: Mapping[str, Any]) -> bool:
+        # For full refresh syncs that don't have a suitable cursor value, we emit a state that contains a sentinel key.
+        # This key is never used by a connector and is needed during a read to skip assigning the incoming state.
+        return FULL_REFRESH_SENTINEL_STATE_KEY in stream_state
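
The helper itself is just a key lookup; a quick sketch with made-up states showing what it treats as a full refresh sentinel:

    FULL_REFRESH_SENTINEL_STATE_KEY = "__ab_full_refresh_state_message"

    def stream_state_is_full_refresh(stream_state) -> bool:
        return FULL_REFRESH_SENTINEL_STATE_KEY in stream_state

    print(stream_state_is_full_refresh({FULL_REFRESH_SENTINEL_STATE_KEY: True}))  # True: skip assigning state
    print(stream_state_is_full_refresh({"updated_at": "2024-01-01"}))             # False: real cursor state
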

airbyte_cdk/sources/connector_state_manager.py

@@ -77,7 +77,7 @@ class ConnectorStateManager:
         stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
         self.per_stream_states[stream_descriptor] = AirbyteStateBlob.parse_obj(value)
 
-    def create_state_message(self, stream_name: str, namespace: Optional[str], send_per_stream_state: bool) -> AirbyteMessage:
+    def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage:
         """
         Generates an AirbyteMessage using the current per-stream state of a specified stream in either the per-stream or legacy format
         :param stream_name: The name of the stream for the message that is being created
@@ -85,25 +85,18 @@ class ConnectorStateManager:
         :param send_per_stream_state: Decides which state format the message should be generated as
         :return: The Airbyte state message to be emitted by the connector during a sync
         """
-        if send_per_stream_state:
-            hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
-            stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
-
-            # According to the Airbyte protocol, the StreamDescriptor namespace field is not required. However, the platform will throw
-            # a validation error if it receives namespace=null. That is why if namespace is None, the field should be omitted instead.
-            stream_descriptor = (
-                StreamDescriptor(name=stream_name) if namespace is None else StreamDescriptor(name=stream_name, namespace=namespace)
-            )
-
-            return AirbyteMessage(
-                type=MessageType.STATE,
-                state=AirbyteStateMessage(
-                    type=AirbyteStateType.STREAM,
-                    stream=AirbyteStreamState(stream_descriptor=stream_descriptor, stream_state=stream_state),
-                    data=dict(self._get_legacy_state()),
+        hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
+        stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
+
+        return AirbyteMessage(
+            type=MessageType.STATE,
+            state=AirbyteStateMessage(
+                type=AirbyteStateType.STREAM,
+                stream=AirbyteStreamState(
+                    stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace), stream_state=stream_state
                 ),
-            )
-        return AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=dict(self._get_legacy_state())))
+            ),
+        )
 
     @classmethod
     def _extract_from_state_message(
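
With the `send_per_stream_state` flag gone, every state message is emitted in the per-stream STREAM format and the legacy `data` blob disappears. A rough sketch of the message `create_state_message("users", None)` now builds, with an illustrative state value:

    from airbyte_cdk.models import (
        AirbyteMessage,
        AirbyteStateBlob,
        AirbyteStateMessage,
        AirbyteStateType,
        AirbyteStreamState,
        StreamDescriptor,
    )
    from airbyte_cdk.models import Type as MessageType

    # Shape only; the stream_state contents depend on the connector
    message = AirbyteMessage(
        type=MessageType.STATE,
        state=AirbyteStateMessage(
            type=AirbyteStateType.STREAM,
            stream=AirbyteStreamState(
                stream_descriptor=StreamDescriptor(name="users", namespace=None),
                stream_state=AirbyteStateBlob(updated_at="2024-01-01"),
            ),
        ),
    )
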
@@ -176,13 +169,6 @@
         streams[stream_descriptor] = AirbyteStateBlob.parse_obj(state_value or {})
         return streams
 
-    def _get_legacy_state(self) -> Mapping[str, Any]:
-        """
-        Using the current per-stream state, creates a mapping of all the stream states for the connector being synced
-        :return: A deep copy of the mapping of stream name to stream state value
-        """
-        return {descriptor.name: state.dict() if state else {} for descriptor, state in self.per_stream_states.items()}
-
     @staticmethod
     def _is_legacy_dict_state(state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]]) -> bool:
         return isinstance(state, dict)

airbyte_cdk/sources/file_based/file_types/parquet_parser.py

@@ -5,7 +5,7 @@
 import json
 import logging
 import os
-from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
 from urllib.parse import unquote
 
 import pyarrow as pa
@@ -16,7 +16,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFile
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
-from pyarrow import Scalar
+from pyarrow import DictionaryArray, Scalar
 
 
 class ParquetParser(FileTypeParser):
@@ -95,10 +95,23 @@ class ParquetParser(FileTypeParser):
         return FileReadMode.READ_BINARY
 
     @staticmethod
-    def _to_output_value(parquet_value: Scalar, parquet_format: ParquetFormat) -> Any:
+    def _to_output_value(parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat) -> Any:
+        """
+        Convert an entry in a pyarrow table to a value that can be output by the source.
+        """
+        if isinstance(parquet_value, DictionaryArray):
+            return ParquetParser._dictionary_array_to_python_value(parquet_value)
+        else:
+            return ParquetParser._scalar_to_python_value(parquet_value, parquet_format)
+
+    @staticmethod
+    def _scalar_to_python_value(parquet_value: Scalar, parquet_format: ParquetFormat) -> Any:
         """
         Convert a pyarrow scalar to a value that can be output by the source.
         """
+        if parquet_value.as_py() is None:
+            return None
+
         # Convert date and datetime objects to isoformat strings
         if pa.types.is_time(parquet_value.type) or pa.types.is_timestamp(parquet_value.type) or pa.types.is_date(parquet_value.type):
             return parquet_value.as_py().isoformat()
@@ -109,23 +122,14 @@ class ParquetParser(FileTypeParser):
 
         # Decode binary strings to utf-8
         if ParquetParser._is_binary(parquet_value.type):
-            py_value = parquet_value.as_py()
-            if py_value is None:
-                return py_value
-            return py_value.decode("utf-8")
+            return parquet_value.as_py().decode("utf-8")
+
         if pa.types.is_decimal(parquet_value.type):
             if parquet_format.decimal_as_float:
                 return parquet_value.as_py()
             else:
                 return str(parquet_value.as_py())
 
-        # Dictionaries are stored as two columns: indices and values
-        # The indices column is an array of integers that maps to the values column
-        if pa.types.is_dictionary(parquet_value.type):
-            return {
-                "indices": parquet_value.indices.tolist(),
-                "values": parquet_value.dictionary.tolist(),
-            }
         if pa.types.is_map(parquet_value.type):
             return {k: v for k, v in parquet_value.as_py()}
 
@@ -149,6 +153,20 @@
         else:
             return parquet_value.as_py()
 
+    @staticmethod
+    def _dictionary_array_to_python_value(parquet_value: DictionaryArray) -> Dict[str, Any]:
+        """
+        Convert a pyarrow dictionary array to a value that can be output by the source.
+
+        Dictionaries are stored as two columns: indices and values
+        The indices column is an array of integers that maps to the values column
+        """
+
+        return {
+            "indices": parquet_value.indices.tolist(),
+            "values": parquet_value.dictionary.tolist(),
+        }
+
     @staticmethod
     def parquet_type_to_schema_type(parquet_type: pa.DataType, parquet_format: ParquetFormat) -> Mapping[str, str]:
         """

airbyte_cdk/sources/file_based/stream/concurrent/adapters.py

@@ -7,7 +7,7 @@ import logging
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union
 
-from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode, Type
+from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, ConfiguredAirbyteStream, Level, SyncMode, Type
 from airbyte_cdk.sources import AbstractSource
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.file_based.availability_strategy import (
@@ -156,29 +156,13 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
     def get_underlying_stream(self) -> DefaultStream:
         return self._abstract_stream
 
-    def read_full_refresh(
+    def read(
         self,
-        cursor_field: Optional[List[str]],
-        logger: logging.Logger,
-        slice_logger: SliceLogger,
-    ) -> Iterable[StreamData]:
-        """
-        Read full refresh. Delegate to the underlying AbstractStream, ignoring all the parameters
-        :param cursor_field: (ignored)
-        :param logger: (ignored)
-        :param slice_logger: (ignored)
-        :return: Iterable of StreamData
-        """
-        yield from self._read_records()
-
-    def read_incremental(
-        self,
-        cursor_field: Optional[List[str]],
+        configured_stream: ConfiguredAirbyteStream,
         logger: logging.Logger,
         slice_logger: SliceLogger,
         stream_state: MutableMapping[str, Any],
         state_manager: ConnectorStateManager,
-        per_stream_state_enabled: bool,
         internal_config: InternalConfig,
     ) -> Iterable[StreamData]:
         yield from self._read_records()

airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py

@@ -155,9 +155,7 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
             self._stream_namespace,
             new_state,
         )
-        state_message = self._connector_state_manager.create_state_message(
-            self._stream_name, self._stream_namespace, send_per_stream_state=True
-        )
+        state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace)
         self._message_repository.emit_message(state_message)
 
     def _get_new_cursor_value(self) -> str:

airbyte_cdk/sources/streams/__init__.py

@@ -3,6 +3,6 @@
 #
 
 # Initialize Streams Package
-from .core import IncrementalMixin, Stream
+from .core import FULL_REFRESH_SENTINEL_STATE_KEY, IncrementalMixin, Stream
 
-__all__ = ["IncrementalMixin", "Stream"]
+__all__ = ["FULL_REFRESH_SENTINEL_STATE_KEY", "IncrementalMixin", "Stream"]

airbyte_cdk/sources/streams/concurrent/adapters.py

@@ -8,7 +8,7 @@ import logging
 from functools import lru_cache
 from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union
 
-from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, Level, SyncMode, Type
+from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, ConfiguredAirbyteStream, Level, SyncMode, Type
 from airbyte_cdk.sources import AbstractSource, Source
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.message import MessageRepository
@@ -116,29 +116,13 @@ class StreamFacade(AbstractStreamFacade[DefaultStream], Stream):
         self._slice_logger = slice_logger
         self._logger = logger
 
-    def read_full_refresh(
+    def read(
         self,
-        cursor_field: Optional[List[str]],
-        logger: logging.Logger,
-        slice_logger: SliceLogger,
-    ) -> Iterable[StreamData]:
-        """
-        Read full refresh. Delegate to the underlying AbstractStream, ignoring all the parameters
-        :param cursor_field: (ignored)
-        :param logger: (ignored)
-        :param slice_logger: (ignored)
-        :return: Iterable of StreamData
-        """
-        yield from self._read_records()
-
-    def read_incremental(
-        self,
-        cursor_field: Optional[List[str]],
+        configured_stream: ConfiguredAirbyteStream,
         logger: logging.Logger,
         slice_logger: SliceLogger,
         stream_state: MutableMapping[str, Any],
         state_manager: ConnectorStateManager,
-        per_stream_state_enabled: bool,
         internal_config: InternalConfig,
     ) -> Iterable[StreamData]:
         yield from self._read_records()

airbyte_cdk/sources/streams/concurrent/cursor.py

@@ -184,9 +184,7 @@ class ConcurrentCursor(Cursor):
         # TODO: if we migrate stored state to the concurrent state format
         # (aka stop calling self._connector_state_converter.convert_to_sequential_state`), we'll need to cast datetimes to string or
         # int before emitting state
-        state_message = self._connector_state_manager.create_state_message(
-            self._stream_name, self._stream_namespace, send_per_stream_state=True
-        )
+        state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace)
         self._message_repository.emit_message(state_message)
 
     def _merge_partitions(self) -> None:

airbyte_cdk/sources/streams/core.py

@@ -11,7 +11,7 @@ from functools import lru_cache
 from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union
 
 import airbyte_cdk.sources.utils.casing as casing
-from airbyte_cdk.models import AirbyteMessage, AirbyteStream, SyncMode
+from airbyte_cdk.models import AirbyteMessage, AirbyteStream, ConfiguredAirbyteStream, SyncMode
 from airbyte_cdk.models import Type as MessageType
 
 # list of all possible HTTP methods which can be used for sending of request bodies
@@ -31,6 +31,10 @@ StreamData = Union[Mapping[str, Any], AirbyteMessage]
 
 JsonSchema = Mapping[str, Any]
 
+# Streams that only support full refresh don't have a suitable cursor so this sentinel
+# value is used to indicate that stream should not load the incoming state value
+FULL_REFRESH_SENTINEL_STATE_KEY = "__ab_full_refresh_state_message"
+
 
 def package_name_from_class(cls: object) -> str:
     """Find the package name given a class name"""
@@ -107,39 +111,24 @@ class Stream(ABC):
         """
         return None
 
-    def read_full_refresh(
-        self,
-        cursor_field: Optional[List[str]],
-        logger: logging.Logger,
-        slice_logger: SliceLogger,
-    ) -> Iterable[StreamData]:
-        slices = self.stream_slices(sync_mode=SyncMode.full_refresh, cursor_field=cursor_field)
-        logger.debug(f"Processing stream slices for {self.name} (sync_mode: full_refresh)", extra={"stream_slices": slices})
-        for _slice in slices:
-            if slice_logger.should_log_slice_message(logger):
-                yield slice_logger.create_slice_log_message(_slice)
-            yield from self.read_records(
-                stream_slice=_slice,
-                sync_mode=SyncMode.full_refresh,
-                cursor_field=cursor_field,
-            )
-
-    def read_incremental(  # type: ignore  # ignoring typing for ConnectorStateManager because of circular dependencies
+    def read(  # type: ignore  # ignoring typing for ConnectorStateManager because of circular dependencies
         self,
-        cursor_field: Optional[List[str]],
+        configured_stream: ConfiguredAirbyteStream,
         logger: logging.Logger,
         slice_logger: SliceLogger,
         stream_state: MutableMapping[str, Any],
         state_manager,
-        per_stream_state_enabled: bool,
         internal_config: InternalConfig,
     ) -> Iterable[StreamData]:
+        sync_mode = configured_stream.sync_mode
+        cursor_field = configured_stream.cursor_field
+
         slices = self.stream_slices(
             cursor_field=cursor_field,
-            sync_mode=SyncMode.incremental,
+            sync_mode=sync_mode,  # todo: change this interface to no longer rely on sync_mode for behavior
             stream_state=stream_state,
         )
-        logger.debug(f"Processing stream slices for {self.name} (sync_mode: incremental)", extra={"stream_slices": slices})
+        logger.debug(f"Processing stream slices for {self.name} (sync_mode: {sync_mode.name})", extra={"stream_slices": slices})
 
         has_slices = False
         record_counter = 0
@@ -148,7 +137,7 @@
             if slice_logger.should_log_slice_message(logger):
                 yield slice_logger.create_slice_log_message(_slice)
             records = self.read_records(
-                sync_mode=SyncMode.incremental,
+                sync_mode=sync_mode,  # todo: change this interface to no longer rely on sync_mode for behavior
                 stream_slice=_slice,
                 stream_state=stream_state,
                 cursor_field=cursor_field or None,
@@ -160,20 +149,34 @@
             ):
                 record_data = record_data_or_message if isinstance(record_data_or_message, Mapping) else record_data_or_message.record
                 stream_state = self.get_updated_state(stream_state, record_data)
-                checkpoint_interval = self.state_checkpoint_interval
                 record_counter += 1
-                if checkpoint_interval and record_counter % checkpoint_interval == 0:
-                    yield self._checkpoint_state(stream_state, state_manager, per_stream_state_enabled)
+
+                if sync_mode == SyncMode.incremental:
+                    # Checkpoint intervals are a bit controversial, but see below comment about why we're gating it right now
+                    checkpoint_interval = self.state_checkpoint_interval
+                    if checkpoint_interval and record_counter % checkpoint_interval == 0:
+                        airbyte_state_message = self._checkpoint_state(stream_state, state_manager)
+                        yield airbyte_state_message
 
                 if internal_config.is_limit_reached(record_counter):
                     break
 
-        yield self._checkpoint_state(stream_state, state_manager, per_stream_state_enabled)
+        if sync_mode == SyncMode.incremental:
+            # Even though right now, only incremental streams running as incremental mode will emit periodic checkpoints. Rather than
+            # overhaul how refresh interacts with the platform, this positions the code so that once we want to start emitting
+            # periodic checkpoints in full refresh mode it can be done here
+            airbyte_state_message = self._checkpoint_state(stream_state, state_manager)
+            yield airbyte_state_message
+
+        if not has_slices or sync_mode == SyncMode.full_refresh:
+            if sync_mode == SyncMode.full_refresh:
+                # We use a dummy state if there is no suitable value provided by full_refresh streams that do not have a valid cursor.
+                # Incremental streams running full_refresh mode emit a meaningful state
+                stream_state = stream_state or {FULL_REFRESH_SENTINEL_STATE_KEY: True}
 
-        if not has_slices:
-            # Safety net to ensure we always emit at least one state message even if there are no slices
-            checkpoint = self._checkpoint_state(stream_state, state_manager, per_stream_state_enabled)
-            yield checkpoint
+            # We should always emit a final state message for full refresh sync or streams that do not have any slices
+            airbyte_state_message = self._checkpoint_state(stream_state, state_manager)
+            yield airbyte_state_message
 
     @abstractmethod
     def read_records(
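
To make the new gating concrete, here is a minimal sketch (not CDK code) of when state messages surface in each mode, assuming a hypothetical `checkpoint_interval` of 2:

    from airbyte_cdk.models import SyncMode

    def emitted(sync_mode: SyncMode, records: list, checkpoint_interval: int = 2) -> list:
        out = []
        for i, record in enumerate(records, start=1):
            out.append(record)
            # Periodic checkpoints are gated to incremental mode only
            if sync_mode == SyncMode.incremental and i % checkpoint_interval == 0:
                out.append("STATE")
        if sync_mode == SyncMode.incremental:
            out.append("STATE")  # final incremental checkpoint
        if sync_mode == SyncMode.full_refresh:
            out.append("STATE")  # final message, sentinel state if the stream has no cursor
        return out

    print(emitted(SyncMode.incremental, ["r1", "r2", "r3"]))   # ['r1', 'r2', 'STATE', 'r3', 'STATE']
    print(emitted(SyncMode.full_refresh, ["r1", "r2", "r3"]))  # ['r1', 'r2', 'r3', 'STATE']
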
@@ -361,7 +364,6 @@ class Stream(ABC):
         self,
         stream_state: Mapping[str, Any],
         state_manager,
-        per_stream_state_enabled: bool,
     ) -> AirbyteMessage:
         # First attempt to retrieve the current state using the stream's state property. We receive an AttributeError if the state
         # property is not implemented by the stream instance and as a fallback, use the stream_state retrieved from the stream
@@ -373,4 +375,4 @@
 
         except AttributeError:
             state_manager.update_state_for_stream(self.name, self.namespace, stream_state)
-        return state_manager.create_state_message(self.name, self.namespace, send_per_stream_state=per_stream_state_enabled)
+        return state_manager.create_state_message(self.name, self.namespace)

{airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.67.1
+Version: 0.67.2
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte