airbyte-cdk 0.67.1__py3-none-any.whl → 0.67.3__py3-none-any.whl
- airbyte_cdk/sources/abstract_source.py +30 -69
- airbyte_cdk/sources/connector_state_manager.py +12 -26
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
- airbyte_cdk/sources/streams/__init__.py +2 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
- airbyte_cdk/sources/streams/core.py +36 -34
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/METADATA +3 -3
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/RECORD +28 -28
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
- unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
- unit_tests/sources/file_based/test_scenarios.py +2 -2
- unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
- unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
- unit_tests/sources/streams/test_stream_read.py +221 -11
- unit_tests/sources/test_abstract_source.py +142 -130
- unit_tests/sources/test_connector_state_manager.py +3 -124
- unit_tests/sources/test_source.py +18 -14
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/top_level.txt +0 -0
airbyte_cdk/sources/abstract_source.py

```diff
@@ -23,7 +23,7 @@ from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
 from airbyte_cdk.sources.source import Source
-from airbyte_cdk.sources.streams import Stream
+from airbyte_cdk.sources.streams import FULL_REFRESH_SENTINEL_STATE_KEY, Stream
 from airbyte_cdk.sources.streams.core import StreamData
 from airbyte_cdk.sources.streams.http.http import HttpStream
 from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
@@ -181,10 +181,6 @@ class AbstractSource(Source, ABC):
     def raise_exception_on_missing_stream(self) -> bool:
         return True
 
-    @property
-    def per_stream_state_enabled(self) -> bool:
-        return True
-
     def _read_stream(
         self,
         logger: logging.Logger,
@@ -206,22 +202,32 @@ class AbstractSource(Source, ABC):
         )
         stream_instance.log_stream_sync_configuration()
 
-        use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental
-        if use_incremental:
-            record_iterator = self._read_incremental(
-                logger,
-                stream_instance,
-                configured_stream,
-                state_manager,
-                internal_config,
-            )
-        else:
-            record_iterator = self._read_full_refresh(logger, stream_instance, configured_stream, internal_config)
+        stream_name = configured_stream.stream.name
+        # The platform always passes stream state regardless of sync mode. We shouldn't need to consider this case within the
+        # connector, but right now we need to prevent accidental usage of the previous stream state
+        stream_state = (
+            state_manager.get_stream_state(stream_name, stream_instance.namespace)
+            if configured_stream.sync_mode == SyncMode.incremental
+            else {}
+        )
+
+        if stream_state and "state" in dir(stream_instance) and not self._stream_state_is_full_refresh(stream_state):
+            stream_instance.state = stream_state  # type: ignore # we check that state in the dir(stream_instance)
+            logger.info(f"Setting state of {self.name} stream to {stream_state}")
+
+        record_iterator = stream_instance.read(
+            configured_stream,
+            logger,
+            self._slice_logger,
+            stream_state,
+            state_manager,
+            internal_config,
+        )
 
         record_counter = 0
-        stream_name = configured_stream.stream.name
         logger.info(f"Syncing stream: {stream_name} ")
-        for record in record_iterator:
+        for record_data_or_message in record_iterator:
+            record = self._get_message(record_data_or_message, stream_instance)
             if record.type == MessageType.RECORD:
                 record_counter += 1
                 if record_counter == 1:
@@ -233,62 +239,11 @@ class AbstractSource(Source, ABC):
 
         logger.info(f"Read {record_counter} records from {stream_name} stream")
 
-    def _read_incremental(
-        self,
-        logger: logging.Logger,
-        stream_instance: Stream,
-        configured_stream: ConfiguredAirbyteStream,
-        state_manager: ConnectorStateManager,
-        internal_config: InternalConfig,
-    ) -> Iterator[AirbyteMessage]:
-        """Read stream using incremental algorithm
-
-        :param logger:
-        :param stream_instance:
-        :param configured_stream:
-        :param state_manager:
-        :param internal_config:
-        :return:
-        """
-        stream_name = configured_stream.stream.name
-        stream_state = state_manager.get_stream_state(stream_name, stream_instance.namespace)
-
-        if stream_state and "state" in dir(stream_instance):
-            stream_instance.state = stream_state  # type: ignore # we check that state in the dir(stream_instance)
-            logger.info(f"Setting state of {self.name} stream to {stream_state}")
-
-        for record_data_or_message in stream_instance.read_incremental(
-            configured_stream.cursor_field,
-            logger,
-            self._slice_logger,
-            stream_state,
-            state_manager,
-            self.per_stream_state_enabled,
-            internal_config,
-        ):
-            yield self._get_message(record_data_or_message, stream_instance)
-
     def _emit_queued_messages(self) -> Iterable[AirbyteMessage]:
         if self.message_repository:
             yield from self.message_repository.consume_queue()
         return
 
-    def _read_full_refresh(
-        self,
-        logger: logging.Logger,
-        stream_instance: Stream,
-        configured_stream: ConfiguredAirbyteStream,
-        internal_config: InternalConfig,
-    ) -> Iterator[AirbyteMessage]:
-        total_records_counter = 0
-        for record_data_or_message in stream_instance.read_full_refresh(configured_stream.cursor_field, logger, self._slice_logger):
-            message = self._get_message(record_data_or_message, stream_instance)
-            yield message
-            if message.type == MessageType.RECORD:
-                total_records_counter += 1
-                if internal_config.is_limit_reached(total_records_counter):
-                    return
-
     def _get_message(self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream) -> AirbyteMessage:
         """
         Converts the input to an AirbyteMessage if it is a StreamData. Returns the input as is if it is already an AirbyteMessage
@@ -317,3 +272,9 @@ class AbstractSource(Source, ABC):
     def _generate_failed_streams_error_message(stream_failures: Mapping[str, AirbyteTracedException]) -> str:
         failures = ", ".join([f"{stream}: {filter_secrets(exception.__repr__())}" for stream, exception in stream_failures.items()])
         return f"During the sync, the following streams did not sync successfully: {failures}"
+
+    @staticmethod
+    def _stream_state_is_full_refresh(stream_state: Mapping[str, Any]) -> bool:
+        # For full refresh syncs that don't have a suitable cursor value, we emit a state that contains a sentinel key.
+        # This key is never used by a connector and is needed during a read to skip assigning the incoming state.
+        return FULL_REFRESH_SENTINEL_STATE_KEY in stream_state
```
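Taken together, these hunks replace the old incremental/full-refresh dispatch with a single `stream_instance.read(...)` call and gate incoming state behind the new sentinel check. A minimal sketch of that gating decision (the helper name and boolean parameter below are illustrative, not CDK API):

```python
from typing import Any, Mapping

# Sentinel exported from airbyte_cdk.sources.streams as of 0.67.3
FULL_REFRESH_SENTINEL_STATE_KEY = "__ab_full_refresh_state_message"


def should_assign_incoming_state(stream_state: Mapping[str, Any], is_incremental: bool) -> bool:
    # State is only loaded into the stream when the sync is incremental, the platform
    # actually sent a value, and that value is not the dummy state emitted by a
    # previous full-refresh sync.
    return is_incremental and bool(stream_state) and FULL_REFRESH_SENTINEL_STATE_KEY not in stream_state


assert should_assign_incoming_state({"updated_at": "2024-02-01"}, is_incremental=True)
assert not should_assign_incoming_state({FULL_REFRESH_SENTINEL_STATE_KEY: True}, is_incremental=True)
assert not should_assign_incoming_state({"updated_at": "2024-02-01"}, is_incremental=False)
```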
airbyte_cdk/sources/connector_state_manager.py

```diff
@@ -77,7 +77,7 @@ class ConnectorStateManager:
         stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
         self.per_stream_states[stream_descriptor] = AirbyteStateBlob.parse_obj(value)
 
-    def create_state_message(self, stream_name: str, namespace: Optional[str], send_per_stream_state: bool) -> AirbyteMessage:
+    def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage:
         """
         Generates an AirbyteMessage using the current per-stream state of a specified stream in either the per-stream or legacy format
         :param stream_name: The name of the stream for the message that is being created
@@ -85,25 +85,18 @@ class ConnectorStateManager:
         :param send_per_stream_state: Decides which state format the message should be generated as
         :return: The Airbyte state message to be emitted by the connector during a sync
         """
-        if send_per_stream_state:
-            hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
-            stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
-
-            return AirbyteMessage(
-                type=MessageType.STATE,
-                state=AirbyteStateMessage(
-                    type=AirbyteStateType.STREAM,
-                    stream=AirbyteStreamState(stream_descriptor=stream_descriptor, stream_state=stream_state),
-                    data=dict(self._get_legacy_state()),
+        hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
+        stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
+
+        return AirbyteMessage(
+            type=MessageType.STATE,
+            state=AirbyteStateMessage(
+                type=AirbyteStateType.STREAM,
+                stream=AirbyteStreamState(
+                    stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace), stream_state=stream_state
                 ),
-            )
-
+            ),
+        )
 
     @classmethod
     def _extract_from_state_message(
@@ -176,13 +169,6 @@ class ConnectorStateManager:
         streams[stream_descriptor] = AirbyteStateBlob.parse_obj(state_value or {})
         return streams
 
-    def _get_legacy_state(self) -> Mapping[str, Any]:
-        """
-        Using the current per-stream state, creates a mapping of all the stream states for the connector being synced
-        :return: A deep copy of the mapping of stream name to stream state value
-        """
-        return {descriptor.name: state.dict() if state else {} for descriptor, state in self.per_stream_states.items()}
-
     @staticmethod
     def _is_legacy_dict_state(state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]]) -> bool:
         return isinstance(state, dict)
```
airbyte_cdk/sources/file_based/file_types/parquet_parser.py

```diff
@@ -5,7 +5,7 @@
 import json
 import logging
 import os
-from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
 from urllib.parse import unquote
 
 import pyarrow as pa
@@ -16,7 +16,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFile
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
-from pyarrow import Scalar
+from pyarrow import DictionaryArray, Scalar
 
 
 class ParquetParser(FileTypeParser):
@@ -95,10 +95,23 @@ class ParquetParser(FileTypeParser):
         return FileReadMode.READ_BINARY
 
     @staticmethod
-    def _to_output_value(parquet_value: Scalar, parquet_format: ParquetFormat) -> Any:
+    def _to_output_value(parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat) -> Any:
+        """
+        Convert an entry in a pyarrow table to a value that can be output by the source.
+        """
+        if isinstance(parquet_value, DictionaryArray):
+            return ParquetParser._dictionary_array_to_python_value(parquet_value)
+        else:
+            return ParquetParser._scalar_to_python_value(parquet_value, parquet_format)
+
+    @staticmethod
+    def _scalar_to_python_value(parquet_value: Scalar, parquet_format: ParquetFormat) -> Any:
         """
         Convert a pyarrow scalar to a value that can be output by the source.
         """
+        if parquet_value.as_py() is None:
+            return None
+
         # Convert date and datetime objects to isoformat strings
         if pa.types.is_time(parquet_value.type) or pa.types.is_timestamp(parquet_value.type) or pa.types.is_date(parquet_value.type):
             return parquet_value.as_py().isoformat()
@@ -109,23 +122,14 @@ class ParquetParser(FileTypeParser):
 
         # Decode binary strings to utf-8
         if ParquetParser._is_binary(parquet_value.type):
-            py_value = parquet_value.as_py()
-            if py_value is None:
-                return py_value
-            return py_value.decode("utf-8")
+            return parquet_value.as_py().decode("utf-8")
+
         if pa.types.is_decimal(parquet_value.type):
             if parquet_format.decimal_as_float:
                 return parquet_value.as_py()
             else:
                 return str(parquet_value.as_py())
 
-        # Dictionaries are stored as two columns: indices and values
-        # The indices column is an array of integers that maps to the values column
-        if pa.types.is_dictionary(parquet_value.type):
-            return {
-                "indices": parquet_value.indices.tolist(),
-                "values": parquet_value.dictionary.tolist(),
-            }
         if pa.types.is_map(parquet_value.type):
             return {k: v for k, v in parquet_value.as_py()}
 
@@ -149,6 +153,20 @@ class ParquetParser(FileTypeParser):
         else:
             return parquet_value.as_py()
 
+    @staticmethod
+    def _dictionary_array_to_python_value(parquet_value: DictionaryArray) -> Dict[str, Any]:
+        """
+        Convert a pyarrow dictionary array to a value that can be output by the source.
+
+        Dictionaries are stored as two columns: indices and values
+        The indices column is an array of integers that maps to the values column
+        """
+
+        return {
+            "indices": parquet_value.indices.tolist(),
+            "values": parquet_value.dictionary.tolist(),
+        }
+
     @staticmethod
     def parquet_type_to_schema_type(parquet_type: pa.DataType, parquet_format: ParquetFormat) -> Mapping[str, str]:
         """
```
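The parser now dispatches on the runtime type: dictionary-encoded columns are handed over as `DictionaryArray` values rather than plain `Scalar`s, so 0.67.3 splits the conversion into a scalar path (with an early `None` short-circuit) and a dictionary path. What the dictionary branch produces, using only public pyarrow APIs:

```python
import pyarrow as pa

# A dictionary-encoded column, as produced for categorical/low-cardinality data.
indices = pa.array([0, 1, 0, 2], type=pa.int8())
dictionary = pa.array(["bask", "crawl", "rest"])
dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

# Mirrors ParquetParser._dictionary_array_to_python_value in 0.67.3.
value = {
    "indices": dict_array.indices.tolist(),
    "values": dict_array.dictionary.tolist(),
}
assert value == {"indices": [0, 1, 0, 2], "values": ["bask", "crawl", "rest"]}
```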
airbyte_cdk/sources/file_based/stream/concurrent/adapters.py

```diff
@@ -7,7 +7,7 @@ import logging
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union
 
-from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode, Type
+from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, ConfiguredAirbyteStream, Level, SyncMode, Type
 from airbyte_cdk.sources import AbstractSource
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.file_based.availability_strategy import (
@@ -156,29 +156,13 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
     def get_underlying_stream(self) -> DefaultStream:
         return self._abstract_stream
 
-    def read_full_refresh(
+    def read(
         self,
-        cursor_field: Optional[List[str]],
-        logger: logging.Logger,
-        slice_logger: SliceLogger,
-    ) -> Iterable[StreamData]:
-        """
-        Read full refresh. Delegate to the underlying AbstractStream, ignoring all the parameters
-        :param cursor_field: (ignored)
-        :param logger: (ignored)
-        :param slice_logger: (ignored)
-        :return: Iterable of StreamData
-        """
-        yield from self._read_records()
-
-    def read_incremental(
-        self,
-        cursor_field: Optional[List[str]],
+        configured_stream: ConfiguredAirbyteStream,
         logger: logging.Logger,
         slice_logger: SliceLogger,
         stream_state: MutableMapping[str, Any],
         state_manager: ConnectorStateManager,
-        per_stream_state_enabled: bool,
         internal_config: InternalConfig,
     ) -> Iterable[StreamData]:
         yield from self._read_records()
```
airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py

```diff
@@ -155,9 +155,7 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
             self._stream_namespace,
             new_state,
         )
-        state_message = self._connector_state_manager.create_state_message(
-            self._stream_name, self._stream_namespace, send_per_stream_state=True
-        )
+        state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace)
         self._message_repository.emit_message(state_message)
 
     def _get_new_cursor_value(self) -> str:
```
airbyte_cdk/sources/streams/__init__.py

```diff
@@ -3,6 +3,6 @@
 #
 
 # Initialize Streams Package
-from .core import IncrementalMixin, Stream
+from .core import FULL_REFRESH_SENTINEL_STATE_KEY, IncrementalMixin, Stream
 
-__all__ = ["IncrementalMixin", "Stream"]
+__all__ = ["FULL_REFRESH_SENTINEL_STATE_KEY", "IncrementalMixin", "Stream"]
```
airbyte_cdk/sources/streams/concurrent/adapters.py

```diff
@@ -8,7 +8,7 @@ import logging
 from functools import lru_cache
 from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union
 
-from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, Level, SyncMode, Type
+from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, ConfiguredAirbyteStream, Level, SyncMode, Type
 from airbyte_cdk.sources import AbstractSource, Source
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.message import MessageRepository
@@ -116,29 +116,13 @@ class StreamFacade(AbstractStreamFacade[DefaultStream], Stream):
         self._slice_logger = slice_logger
         self._logger = logger
 
-    def read_full_refresh(
+    def read(
         self,
-        cursor_field: Optional[List[str]],
-        logger: logging.Logger,
-        slice_logger: SliceLogger,
-    ) -> Iterable[StreamData]:
-        """
-        Read full refresh. Delegate to the underlying AbstractStream, ignoring all the parameters
-        :param cursor_field: (ignored)
-        :param logger: (ignored)
-        :param slice_logger: (ignored)
-        :return: Iterable of StreamData
-        """
-        yield from self._read_records()
-
-    def read_incremental(
-        self,
-        cursor_field: Optional[List[str]],
+        configured_stream: ConfiguredAirbyteStream,
         logger: logging.Logger,
         slice_logger: SliceLogger,
         stream_state: MutableMapping[str, Any],
         state_manager: ConnectorStateManager,
-        per_stream_state_enabled: bool,
         internal_config: InternalConfig,
     ) -> Iterable[StreamData]:
         yield from self._read_records()
```
airbyte_cdk/sources/streams/concurrent/cursor.py

```diff
@@ -184,9 +184,7 @@ class ConcurrentCursor(Cursor):
         # TODO: if we migrate stored state to the concurrent state format
         # (aka stop calling self._connector_state_converter.convert_to_sequential_state`), we'll need to cast datetimes to string or
         # int before emitting state
-        state_message = self._connector_state_manager.create_state_message(
-            self._stream_name, self._stream_namespace, send_per_stream_state=True
-        )
+        state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace)
         self._message_repository.emit_message(state_message)
 
     def _merge_partitions(self) -> None:
```
airbyte_cdk/sources/streams/core.py

```diff
@@ -11,7 +11,7 @@ from functools import lru_cache
 from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union
 
 import airbyte_cdk.sources.utils.casing as casing
-from airbyte_cdk.models import AirbyteMessage, AirbyteStream, SyncMode
+from airbyte_cdk.models import AirbyteMessage, AirbyteStream, ConfiguredAirbyteStream, SyncMode
 from airbyte_cdk.models import Type as MessageType
 
 # list of all possible HTTP methods which can be used for sending of request bodies
@@ -31,6 +31,10 @@ StreamData = Union[Mapping[str, Any], AirbyteMessage]
 
 JsonSchema = Mapping[str, Any]
 
+# Streams that only support full refresh don't have a suitable cursor so this sentinel
+# value is used to indicate that stream should not load the incoming state value
+FULL_REFRESH_SENTINEL_STATE_KEY = "__ab_full_refresh_state_message"
+
 
 def package_name_from_class(cls: object) -> str:
     """Find the package name given a class name"""
@@ -107,39 +111,24 @@ class Stream(ABC):
         """
         return None
 
-    def read_full_refresh(
-        self,
-        cursor_field: Optional[List[str]],
-        logger: logging.Logger,
-        slice_logger: SliceLogger,
-    ) -> Iterable[StreamData]:
-        slices = self.stream_slices(sync_mode=SyncMode.full_refresh, cursor_field=cursor_field)
-        logger.debug(f"Processing stream slices for {self.name} (sync_mode: full_refresh)", extra={"stream_slices": slices})
-        for _slice in slices:
-            if slice_logger.should_log_slice_message(logger):
-                yield slice_logger.create_slice_log_message(_slice)
-            yield from self.read_records(
-                stream_slice=_slice,
-                sync_mode=SyncMode.full_refresh,
-                cursor_field=cursor_field,
-            )
-
-    def read_incremental(  # type: ignore # ignoring typing for ConnectorStateManager because of circular dependencies
+    def read(  # type: ignore # ignoring typing for ConnectorStateManager because of circular dependencies
         self,
-        cursor_field: Optional[List[str]],
+        configured_stream: ConfiguredAirbyteStream,
         logger: logging.Logger,
         slice_logger: SliceLogger,
         stream_state: MutableMapping[str, Any],
         state_manager,
-        per_stream_state_enabled: bool,
         internal_config: InternalConfig,
     ) -> Iterable[StreamData]:
+        sync_mode = configured_stream.sync_mode
+        cursor_field = configured_stream.cursor_field
+
         slices = self.stream_slices(
             cursor_field=cursor_field,
-            sync_mode=SyncMode.incremental,
+            sync_mode=sync_mode,  # todo: change this interface to no longer rely on sync_mode for behavior
            stream_state=stream_state,
         )
-        logger.debug(f"Processing stream slices for {self.name} (sync_mode: incremental)", extra={"stream_slices": slices})
+        logger.debug(f"Processing stream slices for {self.name} (sync_mode: {sync_mode.name})", extra={"stream_slices": slices})
 
         has_slices = False
         record_counter = 0
@@ -148,7 +137,7 @@ class Stream(ABC):
             if slice_logger.should_log_slice_message(logger):
                 yield slice_logger.create_slice_log_message(_slice)
             records = self.read_records(
-                sync_mode=SyncMode.incremental,
+                sync_mode=sync_mode,  # todo: change this interface to no longer rely on sync_mode for behavior
                 stream_slice=_slice,
                 stream_state=stream_state,
                 cursor_field=cursor_field or None,
```
```diff
@@ -160,20 +149,34 @@ class Stream(ABC):
             ):
                 record_data = record_data_or_message if isinstance(record_data_or_message, Mapping) else record_data_or_message.record
                 stream_state = self.get_updated_state(stream_state, record_data)
-                checkpoint_interval = self.state_checkpoint_interval
                 record_counter += 1
-                if checkpoint_interval and record_counter % checkpoint_interval == 0:
-                    yield self._checkpoint_state(stream_state, state_manager, per_stream_state_enabled)
+
+                if sync_mode == SyncMode.incremental:
+                    # Checkpoint intervals are a bit controversial, but see below comment about why we're gating it right now
+                    checkpoint_interval = self.state_checkpoint_interval
+                    if checkpoint_interval and record_counter % checkpoint_interval == 0:
+                        airbyte_state_message = self._checkpoint_state(stream_state, state_manager)
+                        yield airbyte_state_message
 
                 if internal_config.is_limit_reached(record_counter):
                     break
 
-            yield self._checkpoint_state(stream_state, state_manager, per_stream_state_enabled)
+            if sync_mode == SyncMode.incremental:
+                # Even though right now, only incremental streams running as incremental mode will emit periodic checkpoints. Rather than
+                # overhaul how refresh interacts with the platform, this positions the code so that once we want to start emitting
+                # periodic checkpoints in full refresh mode it can be done here
+                airbyte_state_message = self._checkpoint_state(stream_state, state_manager)
+                yield airbyte_state_message
+
+        if not has_slices or sync_mode == SyncMode.full_refresh:
+            if sync_mode == SyncMode.full_refresh:
+                # We use a dummy state if there is no suitable value provided by full_refresh streams that do not have a valid cursor.
+                # Incremental streams running full_refresh mode emit a meaningful state
+                stream_state = stream_state or {FULL_REFRESH_SENTINEL_STATE_KEY: True}
 
-        if not has_slices:
-            checkpoint = self._checkpoint_state(stream_state, state_manager, per_stream_state_enabled)
-            yield checkpoint
+            # We should always emit a final state message for full refresh sync or streams that do not have any slices
+            airbyte_state_message = self._checkpoint_state(stream_state, state_manager)
+            yield airbyte_state_message
 
     @abstractmethod
     def read_records(
@@ -361,7 +364,6 @@ class Stream(ABC):
         self,
         stream_state: Mapping[str, Any],
         state_manager,
-        per_stream_state_enabled: bool,
     ) -> AirbyteMessage:
         # First attempt to retrieve the current state using the stream's state property. We receive an AttributeError if the state
         # property is not implemented by the stream instance and as a fallback, use the stream_state retrieved from the stream
@@ -373,4 +375,4 @@ class Stream(ABC):
 
         except AttributeError:
             state_manager.update_state_for_stream(self.name, self.namespace, stream_state)
-        return state_manager.create_state_message(self.name, self.namespace, send_per_stream_state=per_stream_state_enabled)
+        return state_manager.create_state_message(self.name, self.namespace)
```
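This is the core of the release: `Stream.read_full_refresh` and `Stream.read_incremental` merge into a single `Stream.read` driven by `configured_stream.sync_mode`. Periodic checkpoints stay gated to incremental mode, and every sync now ends with exactly one state message, with the sentinel standing in when a full-refresh stream has no state of its own. A condensed, runnable model of that control flow (illustrative only; the real method also handles slices, slice logging, and record limits):

```python
from typing import Any, Dict, Iterator, List, Tuple, Union

FULL_REFRESH_SENTINEL_STATE_KEY = "__ab_full_refresh_state_message"

Record = Dict[str, Any]
State = Tuple[str, Dict[str, Any]]


def read(records: List[Record], incremental: bool, checkpoint_interval: int = 2) -> Iterator[Union[Record, State]]:
    stream_state: Dict[str, Any] = {}
    record_counter = 0
    for record in records:
        if incremental:
            stream_state = {"cursor": record["id"]}  # stands in for get_updated_state()
        record_counter += 1
        yield record
        if incremental and checkpoint_interval and record_counter % checkpoint_interval == 0:
            yield ("STATE", dict(stream_state))  # periodic checkpoint, incremental only
    if incremental:
        yield ("STATE", dict(stream_state))  # end-of-slice checkpoint
    else:
        # Full refresh ends with exactly one state message, sentinel if nothing better exists.
        yield ("STATE", stream_state or {FULL_REFRESH_SENTINEL_STATE_KEY: True})


full_refresh = list(read([{"id": 1}, {"id": 2}, {"id": 3}], incremental=False))
assert full_refresh[-1] == ("STATE", {FULL_REFRESH_SENTINEL_STATE_KEY: True})
assert sum(1 for item in full_refresh if isinstance(item, tuple)) == 1
```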
{airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.3.dist-info}/METADATA

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.67.1
+Version: 0.67.3
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte
@@ -48,7 +48,7 @@ Requires-Dist: pytest-mock ; extra == 'dev'
 Requires-Dist: requests-mock ; extra == 'dev'
 Requires-Dist: pytest-httpserver ; extra == 'dev'
 Requires-Dist: pandas ==2.0.3 ; extra == 'dev'
-Requires-Dist: pyarrow
+Requires-Dist: pyarrow ~=15.0.0 ; extra == 'dev'
 Requires-Dist: langchain ==0.0.271 ; extra == 'dev'
 Requires-Dist: openai[embeddings] ==0.27.9 ; extra == 'dev'
 Requires-Dist: cohere ==4.21 ; extra == 'dev'
@@ -63,7 +63,7 @@ Requires-Dist: markdown ; extra == 'dev'
 Provides-Extra: file-based
 Requires-Dist: avro ~=1.11.2 ; extra == 'file-based'
 Requires-Dist: fastavro ~=1.8.0 ; extra == 'file-based'
-Requires-Dist: pyarrow
+Requires-Dist: pyarrow ~=15.0.0 ; extra == 'file-based'
 Requires-Dist: unstructured ==0.10.27 ; extra == 'file-based'
 Requires-Dist: unstructured[docx,pptx] ==0.10.27 ; extra == 'file-based'
 Requires-Dist: pdf2image ==1.16.3 ; extra == 'file-based'
```
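For connector developers, the practical upshot of the metadata change is that moving to 0.67.3 also moves the optional parquet dependency onto pyarrow 15, e.g. `pip install --upgrade "airbyte-cdk[file-based]==0.67.3"`.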