airbyte-cdk 0.67.0__py3-none-any.whl → 0.67.2__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/sources/abstract_source.py +30 -69
- airbyte_cdk/sources/connector_state_manager.py +12 -26
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +552 -524
- airbyte_cdk/sources/file_based/config/csv_format.py +2 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
- airbyte_cdk/sources/streams/__init__.py +2 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
- airbyte_cdk/sources/streams/core.py +36 -34
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA +3 -2
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/RECORD +31 -31
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
- unit_tests/sources/file_based/config/test_csv_format.py +6 -1
- unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
- unit_tests/sources/file_based/test_scenarios.py +2 -2
- unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
- unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
- unit_tests/sources/streams/test_stream_read.py +221 -11
- unit_tests/sources/test_abstract_source.py +142 -130
- unit_tests/sources/test_connector_state_manager.py +3 -124
- unit_tests/sources/test_source.py +18 -14
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/top_level.txt +0 -0
@@ -23,7 +23,7 @@ from airbyte_cdk.models import Type as MessageType
|
|
23
23
|
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
24
24
|
from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
|
25
25
|
from airbyte_cdk.sources.source import Source
|
26
|
-
from airbyte_cdk.sources.streams import Stream
|
26
|
+
from airbyte_cdk.sources.streams import FULL_REFRESH_SENTINEL_STATE_KEY, Stream
|
27
27
|
from airbyte_cdk.sources.streams.core import StreamData
|
28
28
|
from airbyte_cdk.sources.streams.http.http import HttpStream
|
29
29
|
from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
|
@@ -181,10 +181,6 @@ class AbstractSource(Source, ABC):
|
|
181
181
|
def raise_exception_on_missing_stream(self) -> bool:
|
182
182
|
return True
|
183
183
|
|
184
|
-
@property
|
185
|
-
def per_stream_state_enabled(self) -> bool:
|
186
|
-
return True
|
187
|
-
|
188
184
|
def _read_stream(
|
189
185
|
self,
|
190
186
|
logger: logging.Logger,
|
@@ -206,22 +202,32 @@ class AbstractSource(Source, ABC):
|
|
206
202
|
)
|
207
203
|
stream_instance.log_stream_sync_configuration()
|
208
204
|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
205
|
+
stream_name = configured_stream.stream.name
|
206
|
+
# The platform always passes stream state regardless of sync mode. We shouldn't need to consider this case within the
|
207
|
+
# connector, but right now we need to prevent accidental usage of the previous stream state
|
208
|
+
stream_state = (
|
209
|
+
state_manager.get_stream_state(stream_name, stream_instance.namespace)
|
210
|
+
if configured_stream.sync_mode == SyncMode.incremental
|
211
|
+
else {}
|
212
|
+
)
|
213
|
+
|
214
|
+
if stream_state and "state" in dir(stream_instance) and not self._stream_state_is_full_refresh(stream_state):
|
215
|
+
stream_instance.state = stream_state # type: ignore # we check that state in the dir(stream_instance)
|
216
|
+
logger.info(f"Setting state of {self.name} stream to {stream_state}")
|
217
|
+
|
218
|
+
record_iterator = stream_instance.read(
|
219
|
+
configured_stream,
|
220
|
+
logger,
|
221
|
+
self._slice_logger,
|
222
|
+
stream_state,
|
223
|
+
state_manager,
|
224
|
+
internal_config,
|
225
|
+
)
|
220
226
|
|
221
227
|
record_counter = 0
|
222
|
-
stream_name = configured_stream.stream.name
|
223
228
|
logger.info(f"Syncing stream: {stream_name} ")
|
224
|
-
for record in record_iterator:
|
229
|
+
for record_data_or_message in record_iterator:
|
230
|
+
record = self._get_message(record_data_or_message, stream_instance)
|
225
231
|
if record.type == MessageType.RECORD:
|
226
232
|
record_counter += 1
|
227
233
|
if record_counter == 1:
|
@@ -233,62 +239,11 @@ class AbstractSource(Source, ABC):
|
|
233
239
|
|
234
240
|
logger.info(f"Read {record_counter} records from {stream_name} stream")
|
235
241
|
|
236
|
-
def _read_incremental(
|
237
|
-
self,
|
238
|
-
logger: logging.Logger,
|
239
|
-
stream_instance: Stream,
|
240
|
-
configured_stream: ConfiguredAirbyteStream,
|
241
|
-
state_manager: ConnectorStateManager,
|
242
|
-
internal_config: InternalConfig,
|
243
|
-
) -> Iterator[AirbyteMessage]:
|
244
|
-
"""Read stream using incremental algorithm
|
245
|
-
|
246
|
-
:param logger:
|
247
|
-
:param stream_instance:
|
248
|
-
:param configured_stream:
|
249
|
-
:param state_manager:
|
250
|
-
:param internal_config:
|
251
|
-
:return:
|
252
|
-
"""
|
253
|
-
stream_name = configured_stream.stream.name
|
254
|
-
stream_state = state_manager.get_stream_state(stream_name, stream_instance.namespace)
|
255
|
-
|
256
|
-
if stream_state and "state" in dir(stream_instance):
|
257
|
-
stream_instance.state = stream_state # type: ignore # we check that state in the dir(stream_instance)
|
258
|
-
logger.info(f"Setting state of {self.name} stream to {stream_state}")
|
259
|
-
|
260
|
-
for record_data_or_message in stream_instance.read_incremental(
|
261
|
-
configured_stream.cursor_field,
|
262
|
-
logger,
|
263
|
-
self._slice_logger,
|
264
|
-
stream_state,
|
265
|
-
state_manager,
|
266
|
-
self.per_stream_state_enabled,
|
267
|
-
internal_config,
|
268
|
-
):
|
269
|
-
yield self._get_message(record_data_or_message, stream_instance)
|
270
|
-
|
271
242
|
def _emit_queued_messages(self) -> Iterable[AirbyteMessage]:
|
272
243
|
if self.message_repository:
|
273
244
|
yield from self.message_repository.consume_queue()
|
274
245
|
return
|
275
246
|
|
276
|
-
def _read_full_refresh(
|
277
|
-
self,
|
278
|
-
logger: logging.Logger,
|
279
|
-
stream_instance: Stream,
|
280
|
-
configured_stream: ConfiguredAirbyteStream,
|
281
|
-
internal_config: InternalConfig,
|
282
|
-
) -> Iterator[AirbyteMessage]:
|
283
|
-
total_records_counter = 0
|
284
|
-
for record_data_or_message in stream_instance.read_full_refresh(configured_stream.cursor_field, logger, self._slice_logger):
|
285
|
-
message = self._get_message(record_data_or_message, stream_instance)
|
286
|
-
yield message
|
287
|
-
if message.type == MessageType.RECORD:
|
288
|
-
total_records_counter += 1
|
289
|
-
if internal_config.is_limit_reached(total_records_counter):
|
290
|
-
return
|
291
|
-
|
292
247
|
def _get_message(self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream) -> AirbyteMessage:
|
293
248
|
"""
|
294
249
|
Converts the input to an AirbyteMessage if it is a StreamData. Returns the input as is if it is already an AirbyteMessage
|
@@ -317,3 +272,9 @@ class AbstractSource(Source, ABC):
|
|
317
272
|
def _generate_failed_streams_error_message(stream_failures: Mapping[str, AirbyteTracedException]) -> str:
|
318
273
|
failures = ", ".join([f"{stream}: {filter_secrets(exception.__repr__())}" for stream, exception in stream_failures.items()])
|
319
274
|
return f"During the sync, the following streams did not sync successfully: {failures}"
|
275
|
+
|
276
|
+
@staticmethod
|
277
|
+
def _stream_state_is_full_refresh(stream_state: Mapping[str, Any]) -> bool:
|
278
|
+
# For full refresh syncs that don't have a suitable cursor value, we emit a state that contains a sentinel key.
|
279
|
+
# This key is never used by a connector and is needed during a read to skip assigning the incoming state.
|
280
|
+
return FULL_REFRESH_SENTINEL_STATE_KEY in stream_state
|
@@ -77,7 +77,7 @@ class ConnectorStateManager:
|
|
77
77
|
stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
|
78
78
|
self.per_stream_states[stream_descriptor] = AirbyteStateBlob.parse_obj(value)
|
79
79
|
|
80
|
-
def create_state_message(self, stream_name: str, namespace: Optional[str], send_per_stream_state: bool) -> AirbyteMessage:
|
80
|
+
def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage:
|
81
81
|
"""
|
82
82
|
Generates an AirbyteMessage using the current per-stream state of a specified stream in either the per-stream or legacy format
|
83
83
|
:param stream_name: The name of the stream for the message that is being created
|
@@ -85,25 +85,18 @@ class ConnectorStateManager:
|
|
85
85
|
:param send_per_stream_state: Decides which state format the message should be generated as
|
86
86
|
:return: The Airbyte state message to be emitted by the connector during a sync
|
87
87
|
"""
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
return AirbyteMessage(
|
99
|
-
type=MessageType.STATE,
|
100
|
-
state=AirbyteStateMessage(
|
101
|
-
type=AirbyteStateType.STREAM,
|
102
|
-
stream=AirbyteStreamState(stream_descriptor=stream_descriptor, stream_state=stream_state),
|
103
|
-
data=dict(self._get_legacy_state()),
|
88
|
+
hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
|
89
|
+
stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
|
90
|
+
|
91
|
+
return AirbyteMessage(
|
92
|
+
type=MessageType.STATE,
|
93
|
+
state=AirbyteStateMessage(
|
94
|
+
type=AirbyteStateType.STREAM,
|
95
|
+
stream=AirbyteStreamState(
|
96
|
+
stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace), stream_state=stream_state
|
104
97
|
),
|
105
|
-
)
|
106
|
-
|
98
|
+
),
|
99
|
+
)
|
107
100
|
|
108
101
|
@classmethod
|
109
102
|
def _extract_from_state_message(
|
@@ -176,13 +169,6 @@ class ConnectorStateManager:
|
|
176
169
|
streams[stream_descriptor] = AirbyteStateBlob.parse_obj(state_value or {})
|
177
170
|
return streams
|
178
171
|
|
179
|
-
def _get_legacy_state(self) -> Mapping[str, Any]:
|
180
|
-
"""
|
181
|
-
Using the current per-stream state, creates a mapping of all the stream states for the connector being synced
|
182
|
-
:return: A deep copy of the mapping of stream name to stream state value
|
183
|
-
"""
|
184
|
-
return {descriptor.name: state.dict() if state else {} for descriptor, state in self.per_stream_states.items()}
|
185
|
-
|
186
172
|
@staticmethod
|
187
173
|
def _is_legacy_dict_state(state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]]) -> bool:
|
188
174
|
return isinstance(state, dict)
|