airbyte-cdk: airbyte_cdk-0.67.1-py3-none-any.whl → airbyte_cdk-0.67.3-py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (28)
  1. airbyte_cdk/sources/abstract_source.py +30 -69
  2. airbyte_cdk/sources/connector_state_manager.py +12 -26
  3. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
  4. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
  5. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
  6. airbyte_cdk/sources/streams/__init__.py +2 -2
  7. airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
  8. airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
  9. airbyte_cdk/sources/streams/core.py +36 -34
  10. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/METADATA +3 -3
  11. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/RECORD +28 -28
  12. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
  13. unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
  14. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
  15. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
  16. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
  17. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
  18. unit_tests/sources/file_based/test_scenarios.py +2 -2
  19. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
  20. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
  21. unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
  22. unit_tests/sources/streams/test_stream_read.py +221 -11
  23. unit_tests/sources/test_abstract_source.py +142 -130
  24. unit_tests/sources/test_connector_state_manager.py +3 -124
  25. unit_tests/sources/test_source.py +18 -14
  26. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/LICENSE.txt +0 -0
  27. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/WHEEL +0 -0
  28. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/top_level.txt +0 -0
@@ -23,7 +23,7 @@ from airbyte_cdk.models import Type as MessageType
23
23
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
24
24
  from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
25
25
  from airbyte_cdk.sources.source import Source
26
- from airbyte_cdk.sources.streams import Stream
26
+ from airbyte_cdk.sources.streams import FULL_REFRESH_SENTINEL_STATE_KEY, Stream
27
27
  from airbyte_cdk.sources.streams.core import StreamData
28
28
  from airbyte_cdk.sources.streams.http.http import HttpStream
29
29
  from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
@@ -181,10 +181,6 @@ class AbstractSource(Source, ABC):
181
181
  def raise_exception_on_missing_stream(self) -> bool:
182
182
  return True
183
183
 
184
- @property
185
- def per_stream_state_enabled(self) -> bool:
186
- return True
187
-
188
184
  def _read_stream(
189
185
  self,
190
186
  logger: logging.Logger,
@@ -206,22 +202,32 @@ class AbstractSource(Source, ABC):
206
202
  )
207
203
  stream_instance.log_stream_sync_configuration()
208
204
 
209
- use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental
210
- if use_incremental:
211
- record_iterator = self._read_incremental(
212
- logger,
213
- stream_instance,
214
- configured_stream,
215
- state_manager,
216
- internal_config,
217
- )
218
- else:
219
- record_iterator = self._read_full_refresh(logger, stream_instance, configured_stream, internal_config)
205
+ stream_name = configured_stream.stream.name
206
+ # The platform always passes stream state regardless of sync mode. We shouldn't need to consider this case within the
207
+ # connector, but right now we need to prevent accidental usage of the previous stream state
208
+ stream_state = (
209
+ state_manager.get_stream_state(stream_name, stream_instance.namespace)
210
+ if configured_stream.sync_mode == SyncMode.incremental
211
+ else {}
212
+ )
213
+
214
+ if stream_state and "state" in dir(stream_instance) and not self._stream_state_is_full_refresh(stream_state):
215
+ stream_instance.state = stream_state # type: ignore # we check that state in the dir(stream_instance)
216
+ logger.info(f"Setting state of {self.name} stream to {stream_state}")
217
+
218
+ record_iterator = stream_instance.read(
219
+ configured_stream,
220
+ logger,
221
+ self._slice_logger,
222
+ stream_state,
223
+ state_manager,
224
+ internal_config,
225
+ )
220
226
 
221
227
  record_counter = 0
222
- stream_name = configured_stream.stream.name
223
228
  logger.info(f"Syncing stream: {stream_name} ")
224
- for record in record_iterator:
229
+ for record_data_or_message in record_iterator:
230
+ record = self._get_message(record_data_or_message, stream_instance)
225
231
  if record.type == MessageType.RECORD:
226
232
  record_counter += 1
227
233
  if record_counter == 1:
@@ -233,62 +239,11 @@ class AbstractSource(Source, ABC):
233
239
 
234
240
  logger.info(f"Read {record_counter} records from {stream_name} stream")
235
241
 
236
- def _read_incremental(
237
- self,
238
- logger: logging.Logger,
239
- stream_instance: Stream,
240
- configured_stream: ConfiguredAirbyteStream,
241
- state_manager: ConnectorStateManager,
242
- internal_config: InternalConfig,
243
- ) -> Iterator[AirbyteMessage]:
244
- """Read stream using incremental algorithm
245
-
246
- :param logger:
247
- :param stream_instance:
248
- :param configured_stream:
249
- :param state_manager:
250
- :param internal_config:
251
- :return:
252
- """
253
- stream_name = configured_stream.stream.name
254
- stream_state = state_manager.get_stream_state(stream_name, stream_instance.namespace)
255
-
256
- if stream_state and "state" in dir(stream_instance):
257
- stream_instance.state = stream_state # type: ignore # we check that state in the dir(stream_instance)
258
- logger.info(f"Setting state of {self.name} stream to {stream_state}")
259
-
260
- for record_data_or_message in stream_instance.read_incremental(
261
- configured_stream.cursor_field,
262
- logger,
263
- self._slice_logger,
264
- stream_state,
265
- state_manager,
266
- self.per_stream_state_enabled,
267
- internal_config,
268
- ):
269
- yield self._get_message(record_data_or_message, stream_instance)
270
-
271
242
  def _emit_queued_messages(self) -> Iterable[AirbyteMessage]:
272
243
  if self.message_repository:
273
244
  yield from self.message_repository.consume_queue()
274
245
  return
275
246
 
276
- def _read_full_refresh(
277
- self,
278
- logger: logging.Logger,
279
- stream_instance: Stream,
280
- configured_stream: ConfiguredAirbyteStream,
281
- internal_config: InternalConfig,
282
- ) -> Iterator[AirbyteMessage]:
283
- total_records_counter = 0
284
- for record_data_or_message in stream_instance.read_full_refresh(configured_stream.cursor_field, logger, self._slice_logger):
285
- message = self._get_message(record_data_or_message, stream_instance)
286
- yield message
287
- if message.type == MessageType.RECORD:
288
- total_records_counter += 1
289
- if internal_config.is_limit_reached(total_records_counter):
290
- return
291
-
292
247
  def _get_message(self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream) -> AirbyteMessage:
293
248
  """
294
249
  Converts the input to an AirbyteMessage if it is a StreamData. Returns the input as is if it is already an AirbyteMessage
@@ -317,3 +272,9 @@ class AbstractSource(Source, ABC):
317
272
  def _generate_failed_streams_error_message(stream_failures: Mapping[str, AirbyteTracedException]) -> str:
318
273
  failures = ", ".join([f"{stream}: {filter_secrets(exception.__repr__())}" for stream, exception in stream_failures.items()])
319
274
  return f"During the sync, the following streams did not sync successfully: {failures}"
275
+
276
+ @staticmethod
277
+ def _stream_state_is_full_refresh(stream_state: Mapping[str, Any]) -> bool:
278
+ # For full refresh syncs that don't have a suitable cursor value, we emit a state that contains a sentinel key.
279
+ # This key is never used by a connector and is needed during a read to skip assigning the incoming state.
280
+ return FULL_REFRESH_SENTINEL_STATE_KEY in stream_state
@@ -77,7 +77,7 @@ class ConnectorStateManager:
77
77
  stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
78
78
  self.per_stream_states[stream_descriptor] = AirbyteStateBlob.parse_obj(value)
79
79
 
80
- def create_state_message(self, stream_name: str, namespace: Optional[str], send_per_stream_state: bool) -> AirbyteMessage:
80
+ def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage:
81
81
  """
82
82
  Generates an AirbyteMessage using the current per-stream state of a specified stream in either the per-stream or legacy format
83
83
  :param stream_name: The name of the stream for the message that is being created
@@ -85,25 +85,18 @@ class ConnectorStateManager:
85
85
  :param send_per_stream_state: Decides which state format the message should be generated as
86
86
  :return: The Airbyte state message to be emitted by the connector during a sync
87
87
  """
88
- if send_per_stream_state:
89
- hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
90
- stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
91
-
92
- # According to the Airbyte protocol, the StreamDescriptor namespace field is not required. However, the platform will throw
93
- # a validation error if it receives namespace=null. That is why if namespace is None, the field should be omitted instead.
94
- stream_descriptor = (
95
- StreamDescriptor(name=stream_name) if namespace is None else StreamDescriptor(name=stream_name, namespace=namespace)
96
- )
97
-
98
- return AirbyteMessage(
99
- type=MessageType.STATE,
100
- state=AirbyteStateMessage(
101
- type=AirbyteStateType.STREAM,
102
- stream=AirbyteStreamState(stream_descriptor=stream_descriptor, stream_state=stream_state),
103
- data=dict(self._get_legacy_state()),
88
+ hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
89
+ stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
90
+
91
+ return AirbyteMessage(
92
+ type=MessageType.STATE,
93
+ state=AirbyteStateMessage(
94
+ type=AirbyteStateType.STREAM,
95
+ stream=AirbyteStreamState(
96
+ stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace), stream_state=stream_state
104
97
  ),
105
- )
106
- return AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=dict(self._get_legacy_state())))
98
+ ),
99
+ )
107
100
 
108
101
  @classmethod
109
102
  def _extract_from_state_message(
@@ -176,13 +169,6 @@ class ConnectorStateManager:
176
169
  streams[stream_descriptor] = AirbyteStateBlob.parse_obj(state_value or {})
177
170
  return streams
178
171
 
179
- def _get_legacy_state(self) -> Mapping[str, Any]:
180
- """
181
- Using the current per-stream state, creates a mapping of all the stream states for the connector being synced
182
- :return: A deep copy of the mapping of stream name to stream state value
183
- """
184
- return {descriptor.name: state.dict() if state else {} for descriptor, state in self.per_stream_states.items()}
185
-
186
172
  @staticmethod
187
173
  def _is_legacy_dict_state(state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]]) -> bool:
188
174
  return isinstance(state, dict)
@@ -5,7 +5,7 @@
5
5
  import json
6
6
  import logging
7
7
  import os
8
- from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple
8
+ from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
9
9
  from urllib.parse import unquote
10
10
 
11
11
  import pyarrow as pa
@@ -16,7 +16,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFile
16
16
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
17
17
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
18
18
  from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
19
- from pyarrow import Scalar
19
+ from pyarrow import DictionaryArray, Scalar
20
20
 
21
21
 
22
22
  class ParquetParser(FileTypeParser):
@@ -95,10 +95,23 @@ class ParquetParser(FileTypeParser):
95
95
  return FileReadMode.READ_BINARY
96
96
 
97
97
  @staticmethod
98
- def _to_output_value(parquet_value: Scalar, parquet_format: ParquetFormat) -> Any:
98
+ def _to_output_value(parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat) -> Any:
99
+ """
100
+ Convert an entry in a pyarrow table to a value that can be output by the source.
101
+ """
102
+ if isinstance(parquet_value, DictionaryArray):
103
+ return ParquetParser._dictionary_array_to_python_value(parquet_value)
104
+ else:
105
+ return ParquetParser._scalar_to_python_value(parquet_value, parquet_format)
106
+
107
+ @staticmethod
108
+ def _scalar_to_python_value(parquet_value: Scalar, parquet_format: ParquetFormat) -> Any:
99
109
  """
100
110
  Convert a pyarrow scalar to a value that can be output by the source.
101
111
  """
112
+ if parquet_value.as_py() is None:
113
+ return None
114
+
102
115
  # Convert date and datetime objects to isoformat strings
103
116
  if pa.types.is_time(parquet_value.type) or pa.types.is_timestamp(parquet_value.type) or pa.types.is_date(parquet_value.type):
104
117
  return parquet_value.as_py().isoformat()
@@ -109,23 +122,14 @@ class ParquetParser(FileTypeParser):
109
122
 
110
123
  # Decode binary strings to utf-8
111
124
  if ParquetParser._is_binary(parquet_value.type):
112
- py_value = parquet_value.as_py()
113
- if py_value is None:
114
- return py_value
115
- return py_value.decode("utf-8")
125
+ return parquet_value.as_py().decode("utf-8")
126
+
116
127
  if pa.types.is_decimal(parquet_value.type):
117
128
  if parquet_format.decimal_as_float:
118
129
  return parquet_value.as_py()
119
130
  else:
120
131
  return str(parquet_value.as_py())
121
132
 
122
- # Dictionaries are stored as two columns: indices and values
123
- # The indices column is an array of integers that maps to the values column
124
- if pa.types.is_dictionary(parquet_value.type):
125
- return {
126
- "indices": parquet_value.indices.tolist(),
127
- "values": parquet_value.dictionary.tolist(),
128
- }
129
133
  if pa.types.is_map(parquet_value.type):
130
134
  return {k: v for k, v in parquet_value.as_py()}
131
135
 
@@ -149,6 +153,20 @@ class ParquetParser(FileTypeParser):
149
153
  else:
150
154
  return parquet_value.as_py()
151
155
 
156
+ @staticmethod
157
+ def _dictionary_array_to_python_value(parquet_value: DictionaryArray) -> Dict[str, Any]:
158
+ """
159
+ Convert a pyarrow dictionary array to a value that can be output by the source.
160
+
161
+ Dictionaries are stored as two columns: indices and values
162
+ The indices column is an array of integers that maps to the values column
163
+ """
164
+
165
+ return {
166
+ "indices": parquet_value.indices.tolist(),
167
+ "values": parquet_value.dictionary.tolist(),
168
+ }
169
+
152
170
  @staticmethod
153
171
  def parquet_type_to_schema_type(parquet_type: pa.DataType, parquet_format: ParquetFormat) -> Mapping[str, str]:
154
172
  """
@@ -7,7 +7,7 @@ import logging
7
7
  from functools import lru_cache
8
8
  from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union
9
9
 
10
- from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode, Type
10
+ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, ConfiguredAirbyteStream, Level, SyncMode, Type
11
11
  from airbyte_cdk.sources import AbstractSource
12
12
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
13
13
  from airbyte_cdk.sources.file_based.availability_strategy import (
@@ -156,29 +156,13 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
156
156
  def get_underlying_stream(self) -> DefaultStream:
157
157
  return self._abstract_stream
158
158
 
159
- def read_full_refresh(
159
+ def read(
160
160
  self,
161
- cursor_field: Optional[List[str]],
162
- logger: logging.Logger,
163
- slice_logger: SliceLogger,
164
- ) -> Iterable[StreamData]:
165
- """
166
- Read full refresh. Delegate to the underlying AbstractStream, ignoring all the parameters
167
- :param cursor_field: (ignored)
168
- :param logger: (ignored)
169
- :param slice_logger: (ignored)
170
- :return: Iterable of StreamData
171
- """
172
- yield from self._read_records()
173
-
174
- def read_incremental(
175
- self,
176
- cursor_field: Optional[List[str]],
161
+ configured_stream: ConfiguredAirbyteStream,
177
162
  logger: logging.Logger,
178
163
  slice_logger: SliceLogger,
179
164
  stream_state: MutableMapping[str, Any],
180
165
  state_manager: ConnectorStateManager,
181
- per_stream_state_enabled: bool,
182
166
  internal_config: InternalConfig,
183
167
  ) -> Iterable[StreamData]:
184
168
  yield from self._read_records()
@@ -155,9 +155,7 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
155
155
  self._stream_namespace,
156
156
  new_state,
157
157
  )
158
- state_message = self._connector_state_manager.create_state_message(
159
- self._stream_name, self._stream_namespace, send_per_stream_state=True
160
- )
158
+ state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace)
161
159
  self._message_repository.emit_message(state_message)
162
160
 
163
161
  def _get_new_cursor_value(self) -> str:
@@ -3,6 +3,6 @@
3
3
  #
4
4
 
5
5
  # Initialize Streams Package
6
- from .core import IncrementalMixin, Stream
6
+ from .core import FULL_REFRESH_SENTINEL_STATE_KEY, IncrementalMixin, Stream
7
7
 
8
- __all__ = ["IncrementalMixin", "Stream"]
8
+ __all__ = ["FULL_REFRESH_SENTINEL_STATE_KEY", "IncrementalMixin", "Stream"]
@@ -8,7 +8,7 @@ import logging
8
8
  from functools import lru_cache
9
9
  from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union
10
10
 
11
- from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, Level, SyncMode, Type
11
+ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, ConfiguredAirbyteStream, Level, SyncMode, Type
12
12
  from airbyte_cdk.sources import AbstractSource, Source
13
13
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
14
14
  from airbyte_cdk.sources.message import MessageRepository
@@ -116,29 +116,13 @@ class StreamFacade(AbstractStreamFacade[DefaultStream], Stream):
116
116
  self._slice_logger = slice_logger
117
117
  self._logger = logger
118
118
 
119
- def read_full_refresh(
119
+ def read(
120
120
  self,
121
- cursor_field: Optional[List[str]],
122
- logger: logging.Logger,
123
- slice_logger: SliceLogger,
124
- ) -> Iterable[StreamData]:
125
- """
126
- Read full refresh. Delegate to the underlying AbstractStream, ignoring all the parameters
127
- :param cursor_field: (ignored)
128
- :param logger: (ignored)
129
- :param slice_logger: (ignored)
130
- :return: Iterable of StreamData
131
- """
132
- yield from self._read_records()
133
-
134
- def read_incremental(
135
- self,
136
- cursor_field: Optional[List[str]],
121
+ configured_stream: ConfiguredAirbyteStream,
137
122
  logger: logging.Logger,
138
123
  slice_logger: SliceLogger,
139
124
  stream_state: MutableMapping[str, Any],
140
125
  state_manager: ConnectorStateManager,
141
- per_stream_state_enabled: bool,
142
126
  internal_config: InternalConfig,
143
127
  ) -> Iterable[StreamData]:
144
128
  yield from self._read_records()
@@ -184,9 +184,7 @@ class ConcurrentCursor(Cursor):
184
184
  # TODO: if we migrate stored state to the concurrent state format
185
185
  # (aka stop calling self._connector_state_converter.convert_to_sequential_state`), we'll need to cast datetimes to string or
186
186
  # int before emitting state
187
- state_message = self._connector_state_manager.create_state_message(
188
- self._stream_name, self._stream_namespace, send_per_stream_state=True
189
- )
187
+ state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace)
190
188
  self._message_repository.emit_message(state_message)
191
189
 
192
190
  def _merge_partitions(self) -> None:
@@ -11,7 +11,7 @@ from functools import lru_cache
11
11
  from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union
12
12
 
13
13
  import airbyte_cdk.sources.utils.casing as casing
14
- from airbyte_cdk.models import AirbyteMessage, AirbyteStream, SyncMode
14
+ from airbyte_cdk.models import AirbyteMessage, AirbyteStream, ConfiguredAirbyteStream, SyncMode
15
15
  from airbyte_cdk.models import Type as MessageType
16
16
 
17
17
  # list of all possible HTTP methods which can be used for sending of request bodies
@@ -31,6 +31,10 @@ StreamData = Union[Mapping[str, Any], AirbyteMessage]
31
31
 
32
32
  JsonSchema = Mapping[str, Any]
33
33
 
34
+ # Streams that only support full refresh don't have a suitable cursor so this sentinel
35
+ # value is used to indicate that stream should not load the incoming state value
36
+ FULL_REFRESH_SENTINEL_STATE_KEY = "__ab_full_refresh_state_message"
37
+
34
38
 
35
39
  def package_name_from_class(cls: object) -> str:
36
40
  """Find the package name given a class name"""
@@ -107,39 +111,24 @@ class Stream(ABC):
107
111
  """
108
112
  return None
109
113
 
110
- def read_full_refresh(
111
- self,
112
- cursor_field: Optional[List[str]],
113
- logger: logging.Logger,
114
- slice_logger: SliceLogger,
115
- ) -> Iterable[StreamData]:
116
- slices = self.stream_slices(sync_mode=SyncMode.full_refresh, cursor_field=cursor_field)
117
- logger.debug(f"Processing stream slices for {self.name} (sync_mode: full_refresh)", extra={"stream_slices": slices})
118
- for _slice in slices:
119
- if slice_logger.should_log_slice_message(logger):
120
- yield slice_logger.create_slice_log_message(_slice)
121
- yield from self.read_records(
122
- stream_slice=_slice,
123
- sync_mode=SyncMode.full_refresh,
124
- cursor_field=cursor_field,
125
- )
126
-
127
- def read_incremental( # type: ignore # ignoring typing for ConnectorStateManager because of circular dependencies
114
+ def read( # type: ignore # ignoring typing for ConnectorStateManager because of circular dependencies
128
115
  self,
129
- cursor_field: Optional[List[str]],
116
+ configured_stream: ConfiguredAirbyteStream,
130
117
  logger: logging.Logger,
131
118
  slice_logger: SliceLogger,
132
119
  stream_state: MutableMapping[str, Any],
133
120
  state_manager,
134
- per_stream_state_enabled: bool,
135
121
  internal_config: InternalConfig,
136
122
  ) -> Iterable[StreamData]:
123
+ sync_mode = configured_stream.sync_mode
124
+ cursor_field = configured_stream.cursor_field
125
+
137
126
  slices = self.stream_slices(
138
127
  cursor_field=cursor_field,
139
- sync_mode=SyncMode.incremental,
128
+ sync_mode=sync_mode, # todo: change this interface to no longer rely on sync_mode for behavior
140
129
  stream_state=stream_state,
141
130
  )
142
- logger.debug(f"Processing stream slices for {self.name} (sync_mode: incremental)", extra={"stream_slices": slices})
131
+ logger.debug(f"Processing stream slices for {self.name} (sync_mode: {sync_mode.name})", extra={"stream_slices": slices})
143
132
 
144
133
  has_slices = False
145
134
  record_counter = 0
@@ -148,7 +137,7 @@ class Stream(ABC):
148
137
  if slice_logger.should_log_slice_message(logger):
149
138
  yield slice_logger.create_slice_log_message(_slice)
150
139
  records = self.read_records(
151
- sync_mode=SyncMode.incremental,
140
+ sync_mode=sync_mode, # todo: change this interface to no longer rely on sync_mode for behavior
152
141
  stream_slice=_slice,
153
142
  stream_state=stream_state,
154
143
  cursor_field=cursor_field or None,
@@ -160,20 +149,34 @@ class Stream(ABC):
160
149
  ):
161
150
  record_data = record_data_or_message if isinstance(record_data_or_message, Mapping) else record_data_or_message.record
162
151
  stream_state = self.get_updated_state(stream_state, record_data)
163
- checkpoint_interval = self.state_checkpoint_interval
164
152
  record_counter += 1
165
- if checkpoint_interval and record_counter % checkpoint_interval == 0:
166
- yield self._checkpoint_state(stream_state, state_manager, per_stream_state_enabled)
153
+
154
+ if sync_mode == SyncMode.incremental:
155
+ # Checkpoint intervals are a bit controversial, but see below comment about why we're gating it right now
156
+ checkpoint_interval = self.state_checkpoint_interval
157
+ if checkpoint_interval and record_counter % checkpoint_interval == 0:
158
+ airbyte_state_message = self._checkpoint_state(stream_state, state_manager)
159
+ yield airbyte_state_message
167
160
 
168
161
  if internal_config.is_limit_reached(record_counter):
169
162
  break
170
163
 
171
- yield self._checkpoint_state(stream_state, state_manager, per_stream_state_enabled)
164
+ if sync_mode == SyncMode.incremental:
165
+ # Even though right now, only incremental streams running as incremental mode will emit periodic checkpoints. Rather than
166
+ # overhaul how refresh interacts with the platform, this positions the code so that once we want to start emitting
167
+ # periodic checkpoints in full refresh mode it can be done here
168
+ airbyte_state_message = self._checkpoint_state(stream_state, state_manager)
169
+ yield airbyte_state_message
170
+
171
+ if not has_slices or sync_mode == SyncMode.full_refresh:
172
+ if sync_mode == SyncMode.full_refresh:
173
+ # We use a dummy state if there is no suitable value provided by full_refresh streams that do not have a valid cursor.
174
+ # Incremental streams running full_refresh mode emit a meaningful state
175
+ stream_state = stream_state or {FULL_REFRESH_SENTINEL_STATE_KEY: True}
172
176
 
173
- if not has_slices:
174
- # Safety net to ensure we always emit at least one state message even if there are no slices
175
- checkpoint = self._checkpoint_state(stream_state, state_manager, per_stream_state_enabled)
176
- yield checkpoint
177
+ # We should always emit a final state message for full refresh sync or streams that do not have any slices
178
+ airbyte_state_message = self._checkpoint_state(stream_state, state_manager)
179
+ yield airbyte_state_message
177
180
 
178
181
  @abstractmethod
179
182
  def read_records(
@@ -361,7 +364,6 @@ class Stream(ABC):
361
364
  self,
362
365
  stream_state: Mapping[str, Any],
363
366
  state_manager,
364
- per_stream_state_enabled: bool,
365
367
  ) -> AirbyteMessage:
366
368
  # First attempt to retrieve the current state using the stream's state property. We receive an AttributeError if the state
367
369
  # property is not implemented by the stream instance and as a fallback, use the stream_state retrieved from the stream
@@ -373,4 +375,4 @@ class Stream(ABC):
373
375
 
374
376
  except AttributeError:
375
377
  state_manager.update_state_for_stream(self.name, self.namespace, stream_state)
376
- return state_manager.create_state_message(self.name, self.namespace, send_per_stream_state=per_stream_state_enabled)
378
+ return state_manager.create_state_message(self.name, self.namespace)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-cdk
3
- Version: 0.67.1
3
+ Version: 0.67.3
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  Home-page: https://github.com/airbytehq/airbyte
6
6
  Author: Airbyte
@@ -48,7 +48,7 @@ Requires-Dist: pytest-mock ; extra == 'dev'
48
48
  Requires-Dist: requests-mock ; extra == 'dev'
49
49
  Requires-Dist: pytest-httpserver ; extra == 'dev'
50
50
  Requires-Dist: pandas ==2.0.3 ; extra == 'dev'
51
- Requires-Dist: pyarrow ==12.0.1 ; extra == 'dev'
51
+ Requires-Dist: pyarrow ~=15.0.0 ; extra == 'dev'
52
52
  Requires-Dist: langchain ==0.0.271 ; extra == 'dev'
53
53
  Requires-Dist: openai[embeddings] ==0.27.9 ; extra == 'dev'
54
54
  Requires-Dist: cohere ==4.21 ; extra == 'dev'
@@ -63,7 +63,7 @@ Requires-Dist: markdown ; extra == 'dev'
63
63
  Provides-Extra: file-based
64
64
  Requires-Dist: avro ~=1.11.2 ; extra == 'file-based'
65
65
  Requires-Dist: fastavro ~=1.8.0 ; extra == 'file-based'
66
- Requires-Dist: pyarrow ==12.0.1 ; extra == 'file-based'
66
+ Requires-Dist: pyarrow ~=15.0.0 ; extra == 'file-based'
67
67
  Requires-Dist: unstructured ==0.10.27 ; extra == 'file-based'
68
68
  Requires-Dist: unstructured[docx,pptx] ==0.10.27 ; extra == 'file-based'
69
69
  Requires-Dist: pdf2image ==1.16.3 ; extra == 'file-based'