airbyte-cdk 0.67.0__py3-none-any.whl → 0.67.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/abstract_source.py +30 -69
- airbyte_cdk/sources/connector_state_manager.py +12 -26
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +552 -524
- airbyte_cdk/sources/file_based/config/csv_format.py +2 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
- airbyte_cdk/sources/streams/__init__.py +2 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
- airbyte_cdk/sources/streams/core.py +36 -34
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA +3 -2
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/RECORD +31 -31
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
- unit_tests/sources/file_based/config/test_csv_format.py +6 -1
- unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
- unit_tests/sources/file_based/test_scenarios.py +2 -2
- unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
- unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
- unit_tests/sources/streams/test_stream_read.py +221 -11
- unit_tests/sources/test_abstract_source.py +142 -130
- unit_tests/sources/test_connector_state_manager.py +3 -124
- unit_tests/sources/test_source.py +18 -14
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/top_level.txt +0 -0
@@ -23,7 +23,7 @@ from airbyte_cdk.models import Type as MessageType
|
|
23
23
|
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
24
24
|
from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
|
25
25
|
from airbyte_cdk.sources.source import Source
|
26
|
-
from airbyte_cdk.sources.streams import Stream
|
26
|
+
from airbyte_cdk.sources.streams import FULL_REFRESH_SENTINEL_STATE_KEY, Stream
|
27
27
|
from airbyte_cdk.sources.streams.core import StreamData
|
28
28
|
from airbyte_cdk.sources.streams.http.http import HttpStream
|
29
29
|
from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
|
@@ -181,10 +181,6 @@ class AbstractSource(Source, ABC):
|
|
181
181
|
def raise_exception_on_missing_stream(self) -> bool:
|
182
182
|
return True
|
183
183
|
|
184
|
-
@property
|
185
|
-
def per_stream_state_enabled(self) -> bool:
|
186
|
-
return True
|
187
|
-
|
188
184
|
def _read_stream(
|
189
185
|
self,
|
190
186
|
logger: logging.Logger,
|
@@ -206,22 +202,32 @@ class AbstractSource(Source, ABC):
|
|
206
202
|
)
|
207
203
|
stream_instance.log_stream_sync_configuration()
|
208
204
|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
205
|
+
stream_name = configured_stream.stream.name
|
206
|
+
# The platform always passes stream state regardless of sync mode. We shouldn't need to consider this case within the
|
207
|
+
# connector, but right now we need to prevent accidental usage of the previous stream state
|
208
|
+
stream_state = (
|
209
|
+
state_manager.get_stream_state(stream_name, stream_instance.namespace)
|
210
|
+
if configured_stream.sync_mode == SyncMode.incremental
|
211
|
+
else {}
|
212
|
+
)
|
213
|
+
|
214
|
+
if stream_state and "state" in dir(stream_instance) and not self._stream_state_is_full_refresh(stream_state):
|
215
|
+
stream_instance.state = stream_state # type: ignore # we check that state in the dir(stream_instance)
|
216
|
+
logger.info(f"Setting state of {self.name} stream to {stream_state}")
|
217
|
+
|
218
|
+
record_iterator = stream_instance.read(
|
219
|
+
configured_stream,
|
220
|
+
logger,
|
221
|
+
self._slice_logger,
|
222
|
+
stream_state,
|
223
|
+
state_manager,
|
224
|
+
internal_config,
|
225
|
+
)
|
220
226
|
|
221
227
|
record_counter = 0
|
222
|
-
stream_name = configured_stream.stream.name
|
223
228
|
logger.info(f"Syncing stream: {stream_name} ")
|
224
|
-
for
|
229
|
+
for record_data_or_message in record_iterator:
|
230
|
+
record = self._get_message(record_data_or_message, stream_instance)
|
225
231
|
if record.type == MessageType.RECORD:
|
226
232
|
record_counter += 1
|
227
233
|
if record_counter == 1:
|
@@ -233,62 +239,11 @@ class AbstractSource(Source, ABC):
|
|
233
239
|
|
234
240
|
logger.info(f"Read {record_counter} records from {stream_name} stream")
|
235
241
|
|
236
|
-
def _read_incremental(
|
237
|
-
self,
|
238
|
-
logger: logging.Logger,
|
239
|
-
stream_instance: Stream,
|
240
|
-
configured_stream: ConfiguredAirbyteStream,
|
241
|
-
state_manager: ConnectorStateManager,
|
242
|
-
internal_config: InternalConfig,
|
243
|
-
) -> Iterator[AirbyteMessage]:
|
244
|
-
"""Read stream using incremental algorithm
|
245
|
-
|
246
|
-
:param logger:
|
247
|
-
:param stream_instance:
|
248
|
-
:param configured_stream:
|
249
|
-
:param state_manager:
|
250
|
-
:param internal_config:
|
251
|
-
:return:
|
252
|
-
"""
|
253
|
-
stream_name = configured_stream.stream.name
|
254
|
-
stream_state = state_manager.get_stream_state(stream_name, stream_instance.namespace)
|
255
|
-
|
256
|
-
if stream_state and "state" in dir(stream_instance):
|
257
|
-
stream_instance.state = stream_state # type: ignore # we check that state in the dir(stream_instance)
|
258
|
-
logger.info(f"Setting state of {self.name} stream to {stream_state}")
|
259
|
-
|
260
|
-
for record_data_or_message in stream_instance.read_incremental(
|
261
|
-
configured_stream.cursor_field,
|
262
|
-
logger,
|
263
|
-
self._slice_logger,
|
264
|
-
stream_state,
|
265
|
-
state_manager,
|
266
|
-
self.per_stream_state_enabled,
|
267
|
-
internal_config,
|
268
|
-
):
|
269
|
-
yield self._get_message(record_data_or_message, stream_instance)
|
270
|
-
|
271
242
|
def _emit_queued_messages(self) -> Iterable[AirbyteMessage]:
|
272
243
|
if self.message_repository:
|
273
244
|
yield from self.message_repository.consume_queue()
|
274
245
|
return
|
275
246
|
|
276
|
-
def _read_full_refresh(
|
277
|
-
self,
|
278
|
-
logger: logging.Logger,
|
279
|
-
stream_instance: Stream,
|
280
|
-
configured_stream: ConfiguredAirbyteStream,
|
281
|
-
internal_config: InternalConfig,
|
282
|
-
) -> Iterator[AirbyteMessage]:
|
283
|
-
total_records_counter = 0
|
284
|
-
for record_data_or_message in stream_instance.read_full_refresh(configured_stream.cursor_field, logger, self._slice_logger):
|
285
|
-
message = self._get_message(record_data_or_message, stream_instance)
|
286
|
-
yield message
|
287
|
-
if message.type == MessageType.RECORD:
|
288
|
-
total_records_counter += 1
|
289
|
-
if internal_config.is_limit_reached(total_records_counter):
|
290
|
-
return
|
291
|
-
|
292
247
|
def _get_message(self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream) -> AirbyteMessage:
|
293
248
|
"""
|
294
249
|
Converts the input to an AirbyteMessage if it is a StreamData. Returns the input as is if it is already an AirbyteMessage
|
@@ -317,3 +272,9 @@ class AbstractSource(Source, ABC):
|
|
317
272
|
def _generate_failed_streams_error_message(stream_failures: Mapping[str, AirbyteTracedException]) -> str:
|
318
273
|
failures = ", ".join([f"{stream}: {filter_secrets(exception.__repr__())}" for stream, exception in stream_failures.items()])
|
319
274
|
return f"During the sync, the following streams did not sync successfully: {failures}"
|
275
|
+
|
276
|
+
@staticmethod
|
277
|
+
def _stream_state_is_full_refresh(stream_state: Mapping[str, Any]) -> bool:
|
278
|
+
# For full refresh syncs that don't have a suitable cursor value, we emit a state that contains a sentinel key.
|
279
|
+
# This key is never used by a connector and is needed during a read to skip assigning the incoming state.
|
280
|
+
return FULL_REFRESH_SENTINEL_STATE_KEY in stream_state
|
@@ -77,7 +77,7 @@ class ConnectorStateManager:
|
|
77
77
|
stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
|
78
78
|
self.per_stream_states[stream_descriptor] = AirbyteStateBlob.parse_obj(value)
|
79
79
|
|
80
|
-
def create_state_message(self, stream_name: str, namespace: Optional[str]
|
80
|
+
def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage:
|
81
81
|
"""
|
82
82
|
Generates an AirbyteMessage using the current per-stream state of a specified stream in either the per-stream or legacy format
|
83
83
|
:param stream_name: The name of the stream for the message that is being created
|
@@ -85,25 +85,18 @@ class ConnectorStateManager:
|
|
85
85
|
:param send_per_stream_state: Decides which state format the message should be generated as
|
86
86
|
:return: The Airbyte state message to be emitted by the connector during a sync
|
87
87
|
"""
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
return AirbyteMessage(
|
99
|
-
type=MessageType.STATE,
|
100
|
-
state=AirbyteStateMessage(
|
101
|
-
type=AirbyteStateType.STREAM,
|
102
|
-
stream=AirbyteStreamState(stream_descriptor=stream_descriptor, stream_state=stream_state),
|
103
|
-
data=dict(self._get_legacy_state()),
|
88
|
+
hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
|
89
|
+
stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
|
90
|
+
|
91
|
+
return AirbyteMessage(
|
92
|
+
type=MessageType.STATE,
|
93
|
+
state=AirbyteStateMessage(
|
94
|
+
type=AirbyteStateType.STREAM,
|
95
|
+
stream=AirbyteStreamState(
|
96
|
+
stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace), stream_state=stream_state
|
104
97
|
),
|
105
|
-
)
|
106
|
-
|
98
|
+
),
|
99
|
+
)
|
107
100
|
|
108
101
|
@classmethod
|
109
102
|
def _extract_from_state_message(
|
@@ -176,13 +169,6 @@ class ConnectorStateManager:
|
|
176
169
|
streams[stream_descriptor] = AirbyteStateBlob.parse_obj(state_value or {})
|
177
170
|
return streams
|
178
171
|
|
179
|
-
def _get_legacy_state(self) -> Mapping[str, Any]:
|
180
|
-
"""
|
181
|
-
Using the current per-stream state, creates a mapping of all the stream states for the connector being synced
|
182
|
-
:return: A deep copy of the mapping of stream name to stream state value
|
183
|
-
"""
|
184
|
-
return {descriptor.name: state.dict() if state else {} for descriptor, state in self.per_stream_states.items()}
|
185
|
-
|
186
172
|
@staticmethod
|
187
173
|
def _is_legacy_dict_state(state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]]) -> bool:
|
188
174
|
return isinstance(state, dict)
|