airbyte-cdk 0.67.1__py3-none-any.whl → 0.67.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/abstract_source.py +30 -69
- airbyte_cdk/sources/connector_state_manager.py +12 -26
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
- airbyte_cdk/sources/streams/__init__.py +2 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
- airbyte_cdk/sources/streams/core.py +36 -34
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/RECORD +28 -28
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
- unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
- unit_tests/sources/file_based/test_scenarios.py +2 -2
- unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
- unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
- unit_tests/sources/streams/test_stream_read.py +221 -11
- unit_tests/sources/test_abstract_source.py +142 -130
- unit_tests/sources/test_connector_state_manager.py +3 -124
- unit_tests/sources/test_source.py +18 -14
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/top_level.txt +0 -0

airbyte_cdk/sources/abstract_source.py

@@ -23,7 +23,7 @@ from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
 from airbyte_cdk.sources.source import Source
-from airbyte_cdk.sources.streams import Stream
+from airbyte_cdk.sources.streams import FULL_REFRESH_SENTINEL_STATE_KEY, Stream
 from airbyte_cdk.sources.streams.core import StreamData
 from airbyte_cdk.sources.streams.http.http import HttpStream
 from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message

@@ -181,10 +181,6 @@ class AbstractSource(Source, ABC):
     def raise_exception_on_missing_stream(self) -> bool:
         return True
 
-    @property
-    def per_stream_state_enabled(self) -> bool:
-        return True
-
     def _read_stream(
         self,
         logger: logging.Logger,

@@ -206,22 +202,32 @@ class AbstractSource(Source, ABC):
         )
         stream_instance.log_stream_sync_configuration()
 
+        stream_name = configured_stream.stream.name
+        # The platform always passes stream state regardless of sync mode. We shouldn't need to consider this case within the
+        # connector, but right now we need to prevent accidental usage of the previous stream state
+        stream_state = (
+            state_manager.get_stream_state(stream_name, stream_instance.namespace)
+            if configured_stream.sync_mode == SyncMode.incremental
+            else {}
+        )
+
+        if stream_state and "state" in dir(stream_instance) and not self._stream_state_is_full_refresh(stream_state):
+            stream_instance.state = stream_state  # type: ignore # we check that state in the dir(stream_instance)
+            logger.info(f"Setting state of {self.name} stream to {stream_state}")
+
+        record_iterator = stream_instance.read(
+            configured_stream,
+            logger,
+            self._slice_logger,
+            stream_state,
+            state_manager,
+            internal_config,
+        )
 
         record_counter = 0
-        stream_name = configured_stream.stream.name
         logger.info(f"Syncing stream: {stream_name} ")
-        for record in record_iterator:
+        for record_data_or_message in record_iterator:
+            record = self._get_message(record_data_or_message, stream_instance)
             if record.type == MessageType.RECORD:
                 record_counter += 1
                 if record_counter == 1:

@@ -233,62 +239,11 @@ class AbstractSource(Source, ABC):
 
         logger.info(f"Read {record_counter} records from {stream_name} stream")
 
-    def _read_incremental(
-        self,
-        logger: logging.Logger,
-        stream_instance: Stream,
-        configured_stream: ConfiguredAirbyteStream,
-        state_manager: ConnectorStateManager,
-        internal_config: InternalConfig,
-    ) -> Iterator[AirbyteMessage]:
-        """Read stream using incremental algorithm
-
-        :param logger:
-        :param stream_instance:
-        :param configured_stream:
-        :param state_manager:
-        :param internal_config:
-        :return:
-        """
-        stream_name = configured_stream.stream.name
-        stream_state = state_manager.get_stream_state(stream_name, stream_instance.namespace)
-
-        if stream_state and "state" in dir(stream_instance):
-            stream_instance.state = stream_state  # type: ignore # we check that state in the dir(stream_instance)
-            logger.info(f"Setting state of {self.name} stream to {stream_state}")
-
-        for record_data_or_message in stream_instance.read_incremental(
-            configured_stream.cursor_field,
-            logger,
-            self._slice_logger,
-            stream_state,
-            state_manager,
-            self.per_stream_state_enabled,
-            internal_config,
-        ):
-            yield self._get_message(record_data_or_message, stream_instance)
-
     def _emit_queued_messages(self) -> Iterable[AirbyteMessage]:
         if self.message_repository:
             yield from self.message_repository.consume_queue()
             return
 
-    def _read_full_refresh(
-        self,
-        logger: logging.Logger,
-        stream_instance: Stream,
-        configured_stream: ConfiguredAirbyteStream,
-        internal_config: InternalConfig,
-    ) -> Iterator[AirbyteMessage]:
-        total_records_counter = 0
-        for record_data_or_message in stream_instance.read_full_refresh(configured_stream.cursor_field, logger, self._slice_logger):
-            message = self._get_message(record_data_or_message, stream_instance)
-            yield message
-            if message.type == MessageType.RECORD:
-                total_records_counter += 1
-                if internal_config.is_limit_reached(total_records_counter):
-                    return
-
     def _get_message(self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream) -> AirbyteMessage:
         """
         Converts the input to an AirbyteMessage if it is a StreamData. Returns the input as is if it is already an AirbyteMessage

@@ -317,3 +272,9 @@ class AbstractSource(Source, ABC):
     def _generate_failed_streams_error_message(stream_failures: Mapping[str, AirbyteTracedException]) -> str:
         failures = ", ".join([f"{stream}: {filter_secrets(exception.__repr__())}" for stream, exception in stream_failures.items()])
         return f"During the sync, the following streams did not sync successfully: {failures}"
+
+    @staticmethod
+    def _stream_state_is_full_refresh(stream_state: Mapping[str, Any]) -> bool:
+        # For full refresh syncs that don't have a suitable cursor value, we emit a state that contains a sentinel key.
+        # This key is never used by a connector and is needed during a read to skip assigning the incoming state.
+        return FULL_REFRESH_SENTINEL_STATE_KEY in stream_state
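
Note: the two gates in the new `_read_stream` body (state is loaded only for incremental syncs, and a sentinel-only state left behind by a previous full refresh is skipped) can be summarized in a small standalone sketch. `should_apply_incoming_state` is our illustrative helper, not CDK API:

from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams import FULL_REFRESH_SENTINEL_STATE_KEY

def should_apply_incoming_state(sync_mode: SyncMode, stream_state: dict) -> bool:
    # Mirrors the checks above: state is consulted only for incremental syncs,
    # and a sentinel-only state from a prior full refresh sync is ignored.
    if sync_mode != SyncMode.incremental:
        return False
    return bool(stream_state) and FULL_REFRESH_SENTINEL_STATE_KEY not in stream_state

assert should_apply_incoming_state(SyncMode.incremental, {"created_at": "2024-01-01"})
assert not should_apply_incoming_state(SyncMode.incremental, {FULL_REFRESH_SENTINEL_STATE_KEY: True})
assert not should_apply_incoming_state(SyncMode.full_refresh, {"created_at": "2024-01-01"})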

airbyte_cdk/sources/connector_state_manager.py

@@ -77,7 +77,7 @@ class ConnectorStateManager:
         stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
         self.per_stream_states[stream_descriptor] = AirbyteStateBlob.parse_obj(value)
 
-    def create_state_message(self, stream_name: str, namespace: Optional[str], send_per_stream_state: bool) -> AirbyteMessage:
+    def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage:
         """
         Generates an AirbyteMessage using the current per-stream state of a specified stream in either the per-stream or legacy format
         :param stream_name: The name of the stream for the message that is being created

@@ -85,25 +85,18 @@ class ConnectorStateManager:
         :param send_per_stream_state: Decides which state format the message should be generated as
         :return: The Airbyte state message to be emitted by the connector during a sync
         """
-        return AirbyteMessage(
-            type=MessageType.STATE,
-            state=AirbyteStateMessage(
-                type=AirbyteStateType.STREAM,
-                stream=AirbyteStreamState(stream_descriptor=stream_descriptor, stream_state=stream_state),
-                data=dict(self._get_legacy_state()),
-            ),
-        )
+        hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
+        stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
+
+        return AirbyteMessage(
+            type=MessageType.STATE,
+            state=AirbyteStateMessage(
+                type=AirbyteStateType.STREAM,
+                stream=AirbyteStreamState(
+                    stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace), stream_state=stream_state
+                ),
+            ),
+        )
 
     @classmethod
     def _extract_from_state_message(

@@ -176,13 +169,6 @@ class ConnectorStateManager:
             streams[stream_descriptor] = AirbyteStateBlob.parse_obj(state_value or {})
         return streams
 
-    def _get_legacy_state(self) -> Mapping[str, Any]:
-        """
-        Using the current per-stream state, creates a mapping of all the stream states for the connector being synced
-        :return: A deep copy of the mapping of stream name to stream state value
-        """
-        return {descriptor.name: state.dict() if state else {} for descriptor, state in self.per_stream_states.items()}
-
     @staticmethod
     def _is_legacy_dict_state(state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]]) -> bool:
        return isinstance(state, dict)
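
Note: with the `send_per_stream_state` flag and `_get_legacy_state` removed, `create_state_message` now always emits STREAM-format state. A hedged sketch of the resulting message shape (the stream name and cursor value are made up for illustration):

from airbyte_cdk.models import (
    AirbyteMessage,
    AirbyteStateBlob,
    AirbyteStateMessage,
    AirbyteStateType,
    AirbyteStreamState,
    StreamDescriptor,
)
from airbyte_cdk.models import Type as MessageType

# The message returned for a stream "lemurs" whose stored state is {"cursor": "2024-01-01"}
expected = AirbyteMessage(
    type=MessageType.STATE,
    state=AirbyteStateMessage(
        type=AirbyteStateType.STREAM,
        stream=AirbyteStreamState(
            stream_descriptor=StreamDescriptor(name="lemurs", namespace=None),
            stream_state=AirbyteStateBlob(cursor="2024-01-01"),
        ),
    ),
)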

airbyte_cdk/sources/file_based/file_types/parquet_parser.py

@@ -5,7 +5,7 @@
 import json
 import logging
 import os
-from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
 from urllib.parse import unquote
 
 import pyarrow as pa

@@ -16,7 +16,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
-from pyarrow import Scalar
+from pyarrow import DictionaryArray, Scalar

@@ -95,10 +95,23 @@ class ParquetParser(FileTypeParser):
         return FileReadMode.READ_BINARY
 
     @staticmethod
-    def _to_output_value(parquet_value: Scalar, parquet_format: ParquetFormat) -> Any:
+    def _to_output_value(parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat) -> Any:
+        """
+        Convert an entry in a pyarrow table to a value that can be output by the source.
+        """
+        if isinstance(parquet_value, DictionaryArray):
+            return ParquetParser._dictionary_array_to_python_value(parquet_value)
+        else:
+            return ParquetParser._scalar_to_python_value(parquet_value, parquet_format)
+
+    @staticmethod
+    def _scalar_to_python_value(parquet_value: Scalar, parquet_format: ParquetFormat) -> Any:
         """
         Convert a pyarrow scalar to a value that can be output by the source.
         """
+        if parquet_value.as_py() is None:
+            return None
+
         # Convert date and datetime objects to isoformat strings
         if pa.types.is_time(parquet_value.type) or pa.types.is_timestamp(parquet_value.type) or pa.types.is_date(parquet_value.type):
             return parquet_value.as_py().isoformat()

@@ -109,23 +122,14 @@ class ParquetParser(FileTypeParser):
 
         # Decode binary strings to utf-8
         if ParquetParser._is_binary(parquet_value.type):
-            py_value = parquet_value.as_py()
-            if py_value is None:
-                return py_value
-            return py_value.decode("utf-8")
+            return parquet_value.as_py().decode("utf-8")
+
         if pa.types.is_decimal(parquet_value.type):
             if parquet_format.decimal_as_float:
                 return parquet_value.as_py()
             else:
                 return str(parquet_value.as_py())
 
-        # Dictionaries are stored as two columns: indices and values
-        # The indices column is an array of integers that maps to the values column
-        if pa.types.is_dictionary(parquet_value.type):
-            return {
-                "indices": parquet_value.indices.tolist(),
-                "values": parquet_value.dictionary.tolist(),
-            }
         if pa.types.is_map(parquet_value.type):
             return {k: v for k, v in parquet_value.as_py()}
 

@@ -149,6 +153,20 @@ class ParquetParser(FileTypeParser):
         else:
             return parquet_value.as_py()
 
+    @staticmethod
+    def _dictionary_array_to_python_value(parquet_value: DictionaryArray) -> Dict[str, Any]:
+        """
+        Convert a pyarrow dictionary array to a value that can be output by the source.
+
+        Dictionaries are stored as two columns: indices and values
+        The indices column is an array of integers that maps to the values column
+        """
+
+        return {
+            "indices": parquet_value.indices.tolist(),
+            "values": parquet_value.dictionary.tolist(),
+        }
+
     @staticmethod
     def parquet_type_to_schema_type(parquet_type: pa.DataType, parquet_format: ParquetFormat) -> Mapping[str, str]:
         """
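
Note: the new `DictionaryArray` branch exists because the parser can receive a whole dictionary-encoded array rather than a `Scalar`, and arrays do not support the scalar `as_py()` path, so the entry is decomposed into its two underlying columns instead. A quick pyarrow illustration:

import pyarrow as pa

# Dictionary-encode a small array; first-appearance order determines the dictionary
arr = pa.array(["low", "high", "low"]).dictionary_encode()
assert isinstance(arr, pa.DictionaryArray)
assert arr.indices.tolist() == [0, 1, 0]           # integer codes, one per row
assert arr.dictionary.tolist() == ["low", "high"]  # the decoded values column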

airbyte_cdk/sources/file_based/stream/concurrent/adapters.py

@@ -7,7 +7,7 @@ import logging
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union
 
-from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode, Type
+from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, ConfiguredAirbyteStream, Level, SyncMode, Type
 from airbyte_cdk.sources import AbstractSource
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.file_based.availability_strategy import (

@@ -156,29 +156,13 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBasedStream):
     def get_underlying_stream(self) -> DefaultStream:
         return self._abstract_stream
 
-    def read_full_refresh(
+    def read(
         self,
-        cursor_field: Optional[List[str]],
-        logger: logging.Logger,
-        slice_logger: SliceLogger,
-    ) -> Iterable[StreamData]:
-        """
-        Read full refresh. Delegate to the underlying AbstractStream, ignoring all the parameters
-        :param cursor_field: (ignored)
-        :param logger: (ignored)
-        :param slice_logger: (ignored)
-        :return: Iterable of StreamData
-        """
-        yield from self._read_records()
-
-    def read_incremental(
-        self,
-        cursor_field: Optional[List[str]],
+        configured_stream: ConfiguredAirbyteStream,
         logger: logging.Logger,
         slice_logger: SliceLogger,
         stream_state: MutableMapping[str, Any],
         state_manager: ConnectorStateManager,
-        per_stream_state_enabled: bool,
         internal_config: InternalConfig,
     ) -> Iterable[StreamData]:
         yield from self._read_records()

airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py

@@ -155,9 +155,7 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
             self._stream_namespace,
             new_state,
         )
-        state_message = self._connector_state_manager.create_state_message(
-            self._stream_name, self._stream_namespace, send_per_stream_state=True
-        )
+        state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace)
         self._message_repository.emit_message(state_message)
 
     def _get_new_cursor_value(self) -> str:

airbyte_cdk/sources/streams/__init__.py

@@ -3,6 +3,6 @@
 #
 
 # Initialize Streams Package
-from .core import IncrementalMixin, Stream
+from .core import FULL_REFRESH_SENTINEL_STATE_KEY, IncrementalMixin, Stream
 
-__all__ = ["IncrementalMixin", "Stream"]
+__all__ = ["FULL_REFRESH_SENTINEL_STATE_KEY", "IncrementalMixin", "Stream"]

airbyte_cdk/sources/streams/concurrent/adapters.py

@@ -8,7 +8,7 @@ import logging
 from functools import lru_cache
 from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union
 
-from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, Level, SyncMode, Type
+from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, ConfiguredAirbyteStream, Level, SyncMode, Type
 from airbyte_cdk.sources import AbstractSource, Source
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.message import MessageRepository

@@ -116,29 +116,13 @@ class StreamFacade(AbstractStreamFacade[DefaultStream], Stream):
         self._slice_logger = slice_logger
         self._logger = logger
 
-    def read_full_refresh(
+    def read(
         self,
-        cursor_field: Optional[List[str]],
-        logger: logging.Logger,
-        slice_logger: SliceLogger,
-    ) -> Iterable[StreamData]:
-        """
-        Read full refresh. Delegate to the underlying AbstractStream, ignoring all the parameters
-        :param cursor_field: (ignored)
-        :param logger: (ignored)
-        :param slice_logger: (ignored)
-        :return: Iterable of StreamData
-        """
-        yield from self._read_records()
-
-    def read_incremental(
-        self,
-        cursor_field: Optional[List[str]],
+        configured_stream: ConfiguredAirbyteStream,
         logger: logging.Logger,
         slice_logger: SliceLogger,
         stream_state: MutableMapping[str, Any],
         state_manager: ConnectorStateManager,
-        per_stream_state_enabled: bool,
         internal_config: InternalConfig,
     ) -> Iterable[StreamData]:
         yield from self._read_records()
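
Note: both facades now expose the same `read(...)` parameters as `Stream.read`, which is what lets `AbstractSource._read_stream` drop its per-sync-mode dispatch. A hypothetical sanity check (not part of the package's own tests):

import inspect

from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade

# Both read() methods should accept the identical parameter list
assert list(inspect.signature(StreamFacade.read).parameters) == list(inspect.signature(Stream.read).parameters)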

airbyte_cdk/sources/streams/concurrent/cursor.py

@@ -184,9 +184,7 @@ class ConcurrentCursor(Cursor):
         # TODO: if we migrate stored state to the concurrent state format
         # (aka stop calling self._connector_state_converter.convert_to_sequential_state`), we'll need to cast datetimes to string or
         # int before emitting state
-        state_message = self._connector_state_manager.create_state_message(
-            self._stream_name, self._stream_namespace, send_per_stream_state=True
-        )
+        state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace)
         self._message_repository.emit_message(state_message)
 
     def _merge_partitions(self) -> None:

airbyte_cdk/sources/streams/core.py

@@ -11,7 +11,7 @@ from functools import lru_cache
 from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union
 
 import airbyte_cdk.sources.utils.casing as casing
-from airbyte_cdk.models import AirbyteMessage, AirbyteStream, SyncMode
+from airbyte_cdk.models import AirbyteMessage, AirbyteStream, ConfiguredAirbyteStream, SyncMode
 from airbyte_cdk.models import Type as MessageType
 
 # list of all possible HTTP methods which can be used for sending of request bodies

@@ -31,6 +31,10 @@ StreamData = Union[Mapping[str, Any], AirbyteMessage]
 
 JsonSchema = Mapping[str, Any]
 
+# Streams that only support full refresh don't have a suitable cursor so this sentinel
+# value is used to indicate that stream should not load the incoming state value
+FULL_REFRESH_SENTINEL_STATE_KEY = "__ab_full_refresh_state_message"
+
 
 def package_name_from_class(cls: object) -> str:
     """Find the package name given a class name"""

@@ -107,39 +111,24 @@ class Stream(ABC):
         """
         return None
 
-    def read_full_refresh(
-        self,
-        cursor_field: Optional[List[str]],
-        logger: logging.Logger,
-        slice_logger: SliceLogger,
-    ) -> Iterable[StreamData]:
-        slices = self.stream_slices(sync_mode=SyncMode.full_refresh, cursor_field=cursor_field)
-        logger.debug(f"Processing stream slices for {self.name} (sync_mode: full_refresh)", extra={"stream_slices": slices})
-        for _slice in slices:
-            if slice_logger.should_log_slice_message(logger):
-                yield slice_logger.create_slice_log_message(_slice)
-            yield from self.read_records(
-                stream_slice=_slice,
-                sync_mode=SyncMode.full_refresh,
-                cursor_field=cursor_field,
-            )
-
-    def read_incremental(  # type: ignore  # ignoring typing for ConnectorStateManager because of circular dependencies
+    def read(  # type: ignore  # ignoring typing for ConnectorStateManager because of circular dependencies
         self,
-        cursor_field: Optional[List[str]],
+        configured_stream: ConfiguredAirbyteStream,
         logger: logging.Logger,
         slice_logger: SliceLogger,
         stream_state: MutableMapping[str, Any],
         state_manager,
-        per_stream_state_enabled: bool,
         internal_config: InternalConfig,
     ) -> Iterable[StreamData]:
+        sync_mode = configured_stream.sync_mode
+        cursor_field = configured_stream.cursor_field
+
         slices = self.stream_slices(
             cursor_field=cursor_field,
-            sync_mode=SyncMode.incremental,
+            sync_mode=sync_mode,  # todo: change this interface to no longer rely on sync_mode for behavior
             stream_state=stream_state,
         )
-        logger.debug(f"Processing stream slices for {self.name} (sync_mode: incremental)", extra={"stream_slices": slices})
+        logger.debug(f"Processing stream slices for {self.name} (sync_mode: {sync_mode.name})", extra={"stream_slices": slices})
 
         has_slices = False
         record_counter = 0

@@ -148,7 +137,7 @@ class Stream(ABC):
             if slice_logger.should_log_slice_message(logger):
                 yield slice_logger.create_slice_log_message(_slice)
             records = self.read_records(
-                sync_mode=SyncMode.incremental,
+                sync_mode=sync_mode,  # todo: change this interface to no longer rely on sync_mode for behavior
                 stream_slice=_slice,
                 stream_state=stream_state,
                 cursor_field=cursor_field or None,

@@ -160,20 +149,34 @@ class Stream(ABC):
             ):
                 record_data = record_data_or_message if isinstance(record_data_or_message, Mapping) else record_data_or_message.record
                 stream_state = self.get_updated_state(stream_state, record_data)
-                checkpoint_interval = self.state_checkpoint_interval
                 record_counter += 1
-                if checkpoint_interval and record_counter % checkpoint_interval == 0:
-                    yield self._checkpoint_state(stream_state, state_manager, per_stream_state_enabled)
+
+                if sync_mode == SyncMode.incremental:
+                    # Checkpoint intervals are a bit controversial, but see below comment about why we're gating it right now
+                    checkpoint_interval = self.state_checkpoint_interval
+                    if checkpoint_interval and record_counter % checkpoint_interval == 0:
+                        airbyte_state_message = self._checkpoint_state(stream_state, state_manager)
+                        yield airbyte_state_message
 
                 if internal_config.is_limit_reached(record_counter):
                     break
 
+            if sync_mode == SyncMode.incremental:
+                # Even though right now, only incremental streams running as incremental mode will emit periodic checkpoints. Rather than
+                # overhaul how refresh interacts with the platform, this positions the code so that once we want to start emitting
+                # periodic checkpoints in full refresh mode it can be done here
+                airbyte_state_message = self._checkpoint_state(stream_state, state_manager)
+                yield airbyte_state_message
+
+        if not has_slices or sync_mode == SyncMode.full_refresh:
+            if sync_mode == SyncMode.full_refresh:
+                # We use a dummy state if there is no suitable value provided by full_refresh streams that do not have a valid cursor.
+                # Incremental streams running full_refresh mode emit a meaningful state
+                stream_state = stream_state or {FULL_REFRESH_SENTINEL_STATE_KEY: True}
 
-        if not has_slices:
-            checkpoint = self._checkpoint_state(stream_state, state_manager, per_stream_state_enabled)
-            yield checkpoint
+            # We should always emit a final state message for full refresh sync or streams that do not have any slices
+            airbyte_state_message = self._checkpoint_state(stream_state, state_manager)
+            yield airbyte_state_message
 
     @abstractmethod
     def read_records(

@@ -361,7 +364,6 @@ class Stream(ABC):
         self,
         stream_state: Mapping[str, Any],
         state_manager,
-        per_stream_state_enabled: bool,
     ) -> AirbyteMessage:
         # First attempt to retrieve the current state using the stream's state property. We receive an AttributeError if the state
         # property is not implemented by the stream instance and as a fallback, use the stream_state retrieved from the stream

@@ -373,4 +375,4 @@ class Stream(ABC):
 
         except AttributeError:
             state_manager.update_state_for_stream(self.name, self.namespace, stream_state)
-        return state_manager.create_state_message(self.name, self.namespace, send_per_stream_state=per_stream_state_enabled)
+        return state_manager.create_state_message(self.name, self.namespace)