airbyte_cdk-6.6.0rc1-py3-none-any.whl → airbyte_cdk-6.6.2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in the public registry.
- airbyte_cdk/__init__.py +10 -4
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +2 -1
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +58 -28
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +43 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +2 -2
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +45 -12
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +6 -3
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +7 -3
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +45 -4
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +23 -3
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +85 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +6 -13
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +69 -19
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +0 -11
- airbyte_cdk/sources/streams/concurrent/adapters.py +4 -102
- airbyte_cdk/sources/streams/concurrent/cursor.py +50 -17
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +0 -15
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +7 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +5 -1
- airbyte_cdk/utils/slice_hasher.py +30 -0
- {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.2.dist-info}/METADATA +7 -8
- {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.2.dist-info}/RECORD +25 -22
- {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.2.dist-info}/WHEEL +0 -0
airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py

```diff
@@ -2,18 +2,17 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
-from abc import abstractmethod
-from dataclasses import dataclass
-from typing import Iterable
+from abc import ABC
 
 from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import (
     RequestOptionsProvider,
 )
-from airbyte_cdk.sources.types import StreamSlice
+from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import (
+    StreamSlicer as ConcurrentStreamSlicer,
+)
 
 
-@dataclass
-class StreamSlicer(RequestOptionsProvider):
+class StreamSlicer(ConcurrentStreamSlicer, RequestOptionsProvider, ABC):
     """
     Slices the stream into a subset of records.
     Slices enable state checkpointing and data retrieval parallelization.
@@ -23,10 +22,4 @@ class StreamSlicer(RequestOptionsProvider):
     See the stream slicing section of the docs for more information.
     """
 
-    @abstractmethod
-    def stream_slices(self) -> Iterable[StreamSlice]:
-        """
-        Defines stream slices
-
-        :return: List of stream slices
-        """
+    pass
```
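The declarative `StreamSlicer` now inherits its `stream_slices()` contract from the new concurrent `StreamSlicer` ABC (added later in this diff) while keeping its `RequestOptionsProvider` surface. A minimal sketch of a concrete subclass against airbyte-cdk 6.6.2 — the `SingleSliceSlicer` name and the trivial method bodies are illustrative, not part of the CDK:

```python
from typing import Any, Iterable, Mapping, Optional, Union

from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer
from airbyte_cdk.sources.types import StreamSlice


class SingleSliceSlicer(StreamSlicer):
    """Hypothetical slicer: one empty slice, no extra request options."""

    def stream_slices(self) -> Iterable[StreamSlice]:
        # Satisfies the concurrent StreamSlicer half of the interface.
        yield StreamSlice(partition={}, cursor_slice={})

    # The four methods below satisfy the RequestOptionsProvider half.
    def get_request_params(self, *, stream_state=None, stream_slice=None, next_page_token=None) -> Mapping[str, Any]:
        return {}

    def get_request_headers(self, *, stream_state=None, stream_slice=None, next_page_token=None) -> Mapping[str, Any]:
        return {}

    def get_request_body_data(self, *, stream_state=None, stream_slice=None, next_page_token=None) -> Union[Mapping[str, Any], str]:
        return {}

    def get_request_body_json(self, *, stream_state=None, stream_slice=None, next_page_token=None) -> Optional[Mapping[str, Any]]:
        return {}
```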
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

```diff
@@ -29,16 +29,25 @@ from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
 from airbyte_cdk.utils import is_cloud_environment
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 from unstructured.file_utils.filetype import (
+    EXT_TO_FILETYPE,
     FILETYPE_TO_MIMETYPE,
     STR_TO_FILETYPE,
     FileType,
     detect_filetype,
 )
+import nltk
 
 unstructured_partition_pdf = None
 unstructured_partition_docx = None
 unstructured_partition_pptx = None
 
+try:
+    nltk.data.find("tokenizers/punkt.zip")
+    nltk.data.find("tokenizers/punkt_tab.zip")
+except LookupError:
+    nltk.download("punkt")
+    nltk.download("punkt_tab")
+
 
 def optional_decode(contents: Union[str, bytes]) -> str:
     if isinstance(contents, bytes):
```
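The guard above downloads the `punkt` tokenizers the first time the module is imported. For network-restricted deployments, the data can be pre-seeded at build time so the `LookupError` branch is never taken; a sketch, where the `/opt/nltk_data` directory is an assumption rather than a CDK convention:

```python
import nltk

# Fetch the tokenizers once, e.g. in a Dockerfile build step (hypothetical dir).
nltk.download("punkt", download_dir="/opt/nltk_data")
nltk.download("punkt_tab", download_dir="/opt/nltk_data")

# Make the directory visible to nltk.data.find() at runtime.
nltk.data.path.append("/opt/nltk_data")
nltk.data.find("tokenizers/punkt.zip")  # resolves locally, no download attempt
```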
```diff
@@ -108,9 +117,11 @@ class UnstructuredParser(FileTypeParser):
         format = _extract_format(config)
         with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
             filetype = self._get_filetype(file_handle, file)
-
             if filetype not in self._supported_file_types() and not format.skip_unprocessable_files:
-                raise self._create_parse_error(file, self._get_file_type_error_message(filetype))
+                raise self._create_parse_error(
+                    file,
+                    self._get_file_type_error_message(filetype),
+                )
 
             return {
                 "content": {
```
```diff
@@ -159,6 +170,10 @@ class UnstructuredParser(FileTypeParser):
                 logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
             else:
                 raise e
+        except Exception as e:
+            exception_str = str(e)
+            logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
+            raise e
 
     def _read_file(
         self,
```
```diff
@@ -176,20 +191,32 @@ class UnstructuredParser(FileTypeParser):
             # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
             raise Exception("unstructured library is not available")
 
-        filetype = self._get_filetype(file_handle, remote_file)
+        filetype: FileType | None = self._get_filetype(file_handle, remote_file)
 
-        if filetype == FileType.MD or filetype == FileType.TXT:
+        if filetype is None or filetype not in self._supported_file_types():
+            raise self._create_parse_error(
+                remote_file,
+                self._get_file_type_error_message(filetype),
+            )
+        if filetype in {FileType.MD, FileType.TXT}:
             file_content: bytes = file_handle.read()
             decoded_content: str = optional_decode(file_content)
             return decoded_content
-        if filetype not in self._supported_file_types():
-            raise self._create_parse_error(remote_file, self._get_file_type_error_message(filetype))
         if format.processing.mode == "local":
-            return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
+            return self._read_file_locally(
+                file_handle,
+                filetype,
+                format.strategy,
+                remote_file,
+            )
         elif format.processing.mode == "api":
             try:
                 result: str = self._read_file_remotely_with_retries(
-                    file_handle, format.processing, filetype, format.strategy, remote_file
+                    file_handle,
+                    format.processing,
+                    filetype,
+                    format.strategy,
+                    remote_file,
                 )
             except Exception as e:
                 # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
```
```diff
@@ -336,7 +363,11 @@ class UnstructuredParser(FileTypeParser):
 
         return self._render_markdown([element.to_dict() for element in elements])
 
-    def _create_parse_error(self, remote_file: RemoteFile, message: str) -> RecordParseError:
+    def _create_parse_error(
+        self,
+        remote_file: RemoteFile,
+        message: str,
+    ) -> RecordParseError:
         return RecordParseError(
             FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
         )
```
```diff
@@ -360,32 +391,51 @@ class UnstructuredParser(FileTypeParser):
         # detect_filetype is either using the file name or file content
         # if possible, try to leverage the file name to detect the file type
         # if the file name is not available, use the file content
-        file_type = detect_filetype(
-            filename=remote_file.uri,
-        )
-        if file_type is not None and not file_type == FileType.UNK:
+        file_type: FileType | None = None
+        try:
+            file_type = detect_filetype(
+                filename=remote_file.uri,
+            )
+        except Exception:
+            # Path doesn't exist locally. Try something else...
+            pass
+
+        if file_type and file_type != FileType.UNK:
             return file_type
 
         type_based_on_content = detect_filetype(file=file)
+        file.seek(0)  # detect_filetype is reading to read the file content, so we need to reset
 
-        # detect_filetype is reading to read the file content
-        file.seek(0)
+        if type_based_on_content and type_based_on_content != FileType.UNK:
+            return type_based_on_content
 
-        return type_based_on_content
+        extension = "." + remote_file.uri.split(".")[-1].lower()
+        if extension in EXT_TO_FILETYPE:
+            return EXT_TO_FILETYPE[extension]
+
+        return None
 
     def _supported_file_types(self) -> List[Any]:
         return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT]
 
-    def _get_file_type_error_message(self, file_type: FileType) -> str:
+    def _get_file_type_error_message(
+        self,
+        file_type: FileType | None,
+    ) -> str:
         supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
-        return f"File type {file_type} is not supported. Supported file types are {supported_file_types}"
+        return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"
 
     def _render_markdown(self, elements: List[Any]) -> str:
         return "\n\n".join((self._convert_to_markdown(el) for el in elements))
 
     def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
         if dpath.get(el, "type") == "Title":
-            heading_str = "#" * (dpath.get(el, "metadata/category_depth", default=1) or 1)
+            category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
+            if not isinstance(category_depth, int):
+                category_depth = (
+                    int(category_depth) if isinstance(category_depth, (str, float)) else 1
+                )
+            heading_str = "#" * category_depth
             return f"{heading_str} {dpath.get(el, 'text')}"
         elif dpath.get(el, "type") == "ListItem":
             return f"- {dpath.get(el, 'text')}"
```
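`_get_filetype` now falls back through three strategies: name-based detection, content-based detection, and finally a plain extension lookup via `EXT_TO_FILETYPE`. A quick sketch of that last fallback, assuming the `unstructured` version the CDK pins:

```python
from unstructured.file_utils.filetype import EXT_TO_FILETYPE, FileType

uri = "reports/2024/summary.md"  # hypothetical file URI
extension = "." + uri.split(".")[-1].lower()
assert EXT_TO_FILETYPE.get(extension) is FileType.MD
assert EXT_TO_FILETYPE.get(".unknown") is None  # unsupported -> parser reports an error
```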
airbyte_cdk/sources/file_based/stream/concurrent/adapters.py

```diff
@@ -226,7 +226,6 @@ class FileBasedStreamPartition(Partition):
         sync_mode: SyncMode,
         cursor_field: Optional[List[str]],
         state: Optional[MutableMapping[str, Any]],
-        cursor: "AbstractConcurrentFileBasedCursor",
     ):
         self._stream = stream
         self._slice = _slice
@@ -234,8 +233,6 @@
         self._sync_mode = sync_mode
         self._cursor_field = cursor_field
         self._state = state
-        self._cursor = cursor
-        self._is_closed = False
 
     def read(self) -> Iterable[Record]:
         try:
@@ -289,13 +286,6 @@
             file = self._slice["files"][0]
             return {"files": [file]}
 
-    def close(self) -> None:
-        self._cursor.close_partition(self)
-        self._is_closed = True
-
-    def is_closed(self) -> bool:
-        return self._is_closed
-
     def __hash__(self) -> int:
         if self._slice:
             # Convert the slice to a string so that it can be hashed
@@ -352,7 +342,6 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):
                     self._sync_mode,
                     self._cursor_field,
                     self._state,
-                    self._cursor,
                 )
             )
         self._cursor.set_pending_partitions(pending_partitions)
```
airbyte_cdk/sources/streams/concurrent/adapters.py

```diff
@@ -38,15 +38,13 @@ from airbyte_cdk.sources.streams.concurrent.helpers import (
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
-from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
-    DateTimeStreamStateConverter,
-)
 from airbyte_cdk.sources.streams.core import StreamData
-from airbyte_cdk.sources.types import StreamSlice
 from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
 from airbyte_cdk.sources.utils.slice_logger import SliceLogger
 from deprecated.classic import deprecated
 
+from airbyte_cdk.utils.slice_hasher import SliceHasher
+
 """
 This module contains adapters to help enabling concurrency on Stream objects without needing to migrate to AbstractStream
 """
@@ -96,7 +94,6 @@ class StreamFacade(AbstractStreamFacade[DefaultStream], Stream):
                     else SyncMode.incremental,
                     [cursor_field] if cursor_field is not None else None,
                     state,
-                    cursor,
                 ),
                 name=stream.name,
                 namespace=stream.namespace,
@@ -259,7 +256,6 @@ class StreamPartition(Partition):
         sync_mode: SyncMode,
         cursor_field: Optional[List[str]],
         state: Optional[MutableMapping[str, Any]],
-        cursor: Cursor,
     ):
         """
         :param stream: The stream to delegate to
@@ -272,8 +268,7 @@
         self._sync_mode = sync_mode
         self._cursor_field = cursor_field
         self._state = state
-        self._cursor = cursor
-        self._is_closed = False
+        self._hash = SliceHasher.hash(self._stream.name, self._slice)
 
     def read(self) -> Iterable[Record]:
         """
@@ -313,23 +308,11 @@
         return self._slice
 
     def __hash__(self) -> int:
-        if self._slice:
-            # Convert the slice to a string so that it can be hashed
-            s = json.dumps(self._slice, sort_keys=True, cls=SliceEncoder)
-            return hash((self._stream.name, s))
-        else:
-            return hash(self._stream.name)
+        return self._hash
 
     def stream_name(self) -> str:
         return self._stream.name
 
-    def close(self) -> None:
-        self._cursor.close_partition(self)
-        self._is_closed = True
-
-    def is_closed(self) -> bool:
-        return self._is_closed
-
     def __repr__(self) -> str:
         return f"StreamPartition({self._stream.name}, {self._slice})"
 
@@ -349,7 +332,6 @@ class StreamPartitionGenerator(PartitionGenerator):
         sync_mode: SyncMode,
         cursor_field: Optional[List[str]],
         state: Optional[MutableMapping[str, Any]],
-        cursor: Cursor,
     ):
         """
         :param stream: The stream to delegate to
@@ -360,7 +342,6 @@
         self._sync_mode = sync_mode
         self._cursor_field = cursor_field
         self._state = state
-        self._cursor = cursor
 
     def generate(self) -> Iterable[Partition]:
         for s in self._stream.stream_slices(
@@ -373,85 +354,6 @@
                 self._sync_mode,
                 self._cursor_field,
                 self._state,
-                self._cursor,
-            )
-
-
-class CursorPartitionGenerator(PartitionGenerator):
-    """
-    This class generates partitions using the concurrent cursor and iterates through state slices to generate partitions.
-
-    It is used when synchronizing a stream in incremental or full-refresh mode where state information is maintained
-    across partitions. Each partition represents a subset of the stream's data and is determined by the cursor's state.
-    """
-
-    _START_BOUNDARY = 0
-    _END_BOUNDARY = 1
-
-    def __init__(
-        self,
-        stream: Stream,
-        message_repository: MessageRepository,
-        cursor: Cursor,
-        connector_state_converter: DateTimeStreamStateConverter,
-        cursor_field: Optional[List[str]],
-        slice_boundary_fields: Optional[Tuple[str, str]],
-    ):
-        """
-        Initialize the CursorPartitionGenerator with a stream, sync mode, and cursor.
-
-        :param stream: The stream to delegate to for partition generation.
-        :param message_repository: The message repository to use to emit non-record messages.
-        :param sync_mode: The synchronization mode.
-        :param cursor: A Cursor object that maintains the state and the cursor field.
-        """
-        self._stream = stream
-        self.message_repository = message_repository
-        self._sync_mode = SyncMode.full_refresh
-        self._cursor = cursor
-        self._cursor_field = cursor_field
-        self._state = self._cursor.state
-        self._slice_boundary_fields = slice_boundary_fields
-        self._connector_state_converter = connector_state_converter
-
-    def generate(self) -> Iterable[Partition]:
-        """
-        Generate partitions based on the slices in the cursor's state.
-
-        This method iterates through the list of slices found in the cursor's state, and for each slice, it generates
-        a `StreamPartition` object.
-
-        :return: An iterable of StreamPartition objects.
-        """
-
-        start_boundary = (
-            self._slice_boundary_fields[self._START_BOUNDARY]
-            if self._slice_boundary_fields
-            else "start"
-        )
-        end_boundary = (
-            self._slice_boundary_fields[self._END_BOUNDARY]
-            if self._slice_boundary_fields
-            else "end"
-        )
-
-        for slice_start, slice_end in self._cursor.generate_slices():
-            stream_slice = StreamSlice(
-                partition={},
-                cursor_slice={
-                    start_boundary: self._connector_state_converter.output_format(slice_start),
-                    end_boundary: self._connector_state_converter.output_format(slice_end),
-                },
-            )
-
-            yield StreamPartition(
-                self._stream,
-                copy.deepcopy(stream_slice),
-                self.message_repository,
-                self._sync_mode,
-                self._cursor_field,
-                self._state,
-                self._cursor,
             )
 
 
```
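`StreamPartition.__hash__` previously re-serialized the slice on every call; it now returns a hash precomputed in the constructor with the new `SliceHasher` utility (full source at the end of this diff). A small sketch of its behavior:

```python
from airbyte_cdk.utils.slice_hasher import SliceHasher

# Deterministic for equal (stream_name, slice) pairs; key order is irrelevant
# because the slice is serialized with json.dumps(..., sort_keys=True).
h1 = SliceHasher.hash("users", {"start": "2024-01-01", "end": "2024-01-02"})
h2 = SliceHasher.hash("users", {"end": "2024-01-02", "start": "2024-01-01"})
assert h1 == h2

# Slice-less partitions hash on the stream name alone.
print(SliceHasher.hash("users"))
```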
airbyte_cdk/sources/streams/concurrent/cursor.py

```diff
@@ -11,9 +11,11 @@ from airbyte_cdk.sources.message import MessageRepository
 from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
+from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
 from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import (
     AbstractStreamStateConverter,
 )
+from airbyte_cdk.sources.types import StreamSlice
 
 
 def _extract_value(mapping: Mapping[str, Any], path: List[str]) -> Any:
@@ -61,7 +63,7 @@ class CursorField:
         return cursor_value  # type: ignore  # we assume that the value the path points at is a comparable
 
 
-class Cursor(ABC):
+class Cursor(StreamSlicer, ABC):
     @property
     @abstractmethod
     def state(self) -> MutableMapping[str, Any]: ...
@@ -88,12 +90,12 @@ class Cursor(ABC):
         """
         raise NotImplementedError()
 
-    def generate_slices(self) -> Iterable[Tuple[CursorValueType, CursorValueType]]:
+    def stream_slices(self) -> Iterable[StreamSlice]:
         """
         Default placeholder implementation of generate_slices.
         Subclasses can override this method to provide actual behavior.
         """
-        yield
+        yield StreamSlice(partition={}, cursor_slice={})
 
 
 class FinalStateCursor(Cursor):
@@ -184,8 +186,15 @@ class ConcurrentCursor(Cursor):
         return self._cursor_field
 
     @property
-    def slice_boundary_fields(self) -> Optional[Tuple[str, str]]:
-        return self._slice_boundary_fields
+    def _slice_boundary_fields_wrapper(self) -> Tuple[str, str]:
+        return (
+            self._slice_boundary_fields
+            if self._slice_boundary_fields
+            else (
+                self._connector_state_converter.START_KEY,
+                self._connector_state_converter.END_KEY,
+            )
+        )
 
     def _get_concurrent_state(
         self, state: MutableMapping[str, Any]
@@ -299,7 +308,7 @@ class ConcurrentCursor(Cursor):
         """
         self._emit_state_message()
 
-    def generate_slices(self) -> Iterable[Tuple[CursorValueType, CursorValueType]]:
+    def stream_slices(self) -> Iterable[StreamSlice]:
         """
         Generating slices based on a few parameters:
         * lookback_window: Buffer to remove from END_KEY of the highest slice
@@ -368,7 +377,7 @@ class ConcurrentCursor(Cursor):
 
     def _split_per_slice_range(
         self, lower: CursorValueType, upper: CursorValueType, upper_is_end: bool
-    ) -> Iterable[Tuple[CursorValueType, CursorValueType]]:
+    ) -> Iterable[StreamSlice]:
         if lower >= upper:
             return
 
@@ -377,10 +386,22 @@ class ConcurrentCursor(Cursor):
 
         lower = max(lower, self._start) if self._start else lower
         if not self._slice_range or self._evaluate_upper_safely(lower, self._slice_range) >= upper:
-            yield (
-                lower,
-                upper - self._cursor_granularity if self._cursor_granularity and not upper_is_end else upper,
-            )
+            start_value, end_value = (
+                (lower, upper - self._cursor_granularity)
+                if self._cursor_granularity and not upper_is_end
+                else (lower, upper)
+            )
+            yield StreamSlice(
+                partition={},
+                cursor_slice={
+                    self._slice_boundary_fields_wrapper[
+                        self._START_BOUNDARY
+                    ]: self._connector_state_converter.output_format(start_value),
+                    self._slice_boundary_fields_wrapper[
+                        self._END_BOUNDARY
+                    ]: self._connector_state_converter.output_format(end_value),
+                },
+            )
         else:
             stop_processing = False
             current_lower_boundary = lower
@@ -389,12 +410,24 @@ class ConcurrentCursor(Cursor):
                 self._evaluate_upper_safely(current_lower_boundary, self._slice_range), upper
             )
             has_reached_upper_boundary = current_upper_boundary >= upper
-            yield current_lower_boundary, (
-                current_upper_boundary - self._cursor_granularity
-                if self._cursor_granularity
-                and (not upper_is_end or not has_reached_upper_boundary)
-                else current_upper_boundary
-            )
+
+            start_value, end_value = (
+                (current_lower_boundary, current_upper_boundary - self._cursor_granularity)
+                if self._cursor_granularity
+                and (not upper_is_end or not has_reached_upper_boundary)
+                else (current_lower_boundary, current_upper_boundary)
+            )
+            yield StreamSlice(
+                partition={},
+                cursor_slice={
+                    self._slice_boundary_fields_wrapper[
+                        self._START_BOUNDARY
+                    ]: self._connector_state_converter.output_format(start_value),
+                    self._slice_boundary_fields_wrapper[
+                        self._END_BOUNDARY
+                    ]: self._connector_state_converter.output_format(end_value),
+                },
+            )
             current_lower_boundary = current_upper_boundary
             if current_upper_boundary >= upper:
                 stop_processing = True
```
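With `Cursor` subclassing the concurrent `StreamSlicer`, cursors now yield ready-made `StreamSlice` objects (boundaries already serialized through the converter's `output_format`) instead of raw `(start, end)` tuples. A sketch of the base-class default via `FinalStateCursor`, assuming its constructor still takes the stream name, namespace, and a message repository, and that it keeps the inherited `stream_slices()`:

```python
from airbyte_cdk.sources.message import InMemoryMessageRepository
from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor

cursor = FinalStateCursor(
    stream_name="users",
    stream_namespace=None,
    message_repository=InMemoryMessageRepository(),
)

# The default implementation yields a single empty slice.
slices = list(cursor.stream_slices())
assert len(slices) == 1
assert slices[0].partition == {} and slices[0].cursor_slice == {}
```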
airbyte_cdk/sources/streams/concurrent/partitions/partition.py

```diff
@@ -40,21 +40,6 @@ class Partition(ABC):
         """
         pass
 
-    @abstractmethod
-    def close(self) -> None:
-        """
-        Closes the partition.
-        """
-        pass
-
-    @abstractmethod
-    def is_closed(self) -> bool:
-        """
-        Returns whether the partition is closed.
-        :return:
-        """
-        pass
-
     @abstractmethod
     def __hash__(self) -> int:
         """
```
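`close()` and `is_closed()` are gone from the `Partition` contract; partition lifecycle is now driven by the cursor (`cursor.close_partition(...)`) rather than by the partition itself. A minimal sketch of implementing the slimmed-down ABC — `EmptyPartition` is illustrative, and I'm assuming the remaining abstract methods are `read`, `to_slice`, `stream_name`, and `__hash__`:

```python
from typing import Any, Iterable, Mapping, Optional

from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
from airbyte_cdk.utils.slice_hasher import SliceHasher


class EmptyPartition(Partition):
    """Hypothetical partition that yields no records."""

    def __init__(self, stream_name: str, _slice: Optional[Mapping[str, Any]] = None):
        self._stream_name = stream_name
        self._slice = _slice

    def read(self) -> Iterable[Record]:
        # A real partition would perform I/O here; no close() bookkeeping anymore.
        yield from ()

    def to_slice(self) -> Optional[Mapping[str, Any]]:
        return self._slice

    def stream_name(self) -> str:
        return self._stream_name

    def __hash__(self) -> int:
        return SliceHasher.hash(self._stream_name, self._slice)
```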
airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py (new file)

```diff
@@ -0,0 +1,21 @@
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+
+from abc import ABC, abstractmethod
+from typing import Iterable
+
+from airbyte_cdk.sources.types import StreamSlice
+
+
+class StreamSlicer(ABC):
+    """
+    Slices the stream into chunks that can be fetched independently. Slices enable state checkpointing and data retrieval parallelization.
+    """
+
+    @abstractmethod
+    def stream_slices(self) -> Iterable[StreamSlice]:
+        """
+        Defines stream slices
+
+        :return: An iterable of stream slices
+        """
+        pass
```
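This new ABC is the single slicing interface now shared by declarative slicers and concurrent cursors. A sketch of a standalone implementation against airbyte-cdk 6.6.2 — the `DailySlicer` date-windowing logic is illustrative:

```python
from datetime import date, timedelta
from typing import Iterable

from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
from airbyte_cdk.sources.types import StreamSlice


class DailySlicer(StreamSlicer):
    """Hypothetical slicer emitting one slice per day in [start, end)."""

    def __init__(self, start: date, end: date):
        self._start = start
        self._end = end

    def stream_slices(self) -> Iterable[StreamSlice]:
        current = self._start
        while current < self._end:
            yield StreamSlice(
                partition={},
                cursor_slice={
                    "start": current.isoformat(),
                    "end": (current + timedelta(days=1)).isoformat(),
                },
            )
            current += timedelta(days=1)


for s in DailySlicer(date(2024, 1, 1), date(2024, 1, 3)).stream_slices():
    print(s.cursor_slice)
# {'start': '2024-01-01', 'end': '2024-01-02'}
# {'start': '2024-01-02', 'end': '2024-01-03'}
```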
airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py

```diff
@@ -124,6 +124,13 @@ class AbstractStreamStateConverter(ABC):
         """
         ...
 
+    @abstractmethod
+    def output_format(self, value: Any) -> Any:
+        """
+        Convert the cursor value type to a JSON valid type.
+        """
+        ...
+
     def merge_intervals(
         self, intervals: List[MutableMapping[str, Any]]
     ) -> List[MutableMapping[str, Any]]:
```
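`output_format` joins the abstract contract because `ConcurrentCursor.stream_slices()` calls it to serialize slice boundaries. A sketch of how a datetime-based converter would satisfy it — the class is a hypothetical fragment; only `output_format` is the required addition:

```python
from datetime import datetime, timezone
from typing import Any


class MyDatetimeConverterFragment:
    """Illustrative fragment: emit a JSON-safe ISO-8601 string for a datetime cursor."""

    def output_format(self, value: datetime) -> Any:
        return value.astimezone(timezone.utc).isoformat(timespec="milliseconds")


print(MyDatetimeConverterFragment().output_format(datetime(2024, 1, 1, tzinfo=timezone.utc)))
# 2024-01-01T00:00:00.000+00:00
```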
airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py

```diff
@@ -82,7 +82,11 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
         # The start and end are the same to avoid confusion as to whether the records for this slice
         # were actually synced
         slices = [
-            {self.START_KEY: start if start is not None else sync_start, self.END_KEY: sync_start, self.MOST_RECENT_RECORD_KEY: sync_start}
+            {
+                self.START_KEY: start if start is not None else sync_start,
+                self.END_KEY: sync_start,
+                self.MOST_RECENT_RECORD_KEY: sync_start,
+            }
         ]
 
         return sync_start, {
```
airbyte_cdk/utils/slice_hasher.py (new file)

```diff
@@ -0,0 +1,30 @@
+import hashlib
+import json
+from typing import Any, Mapping, Optional, Final
+
+
+class SliceEncoder(json.JSONEncoder):
+    def default(self, obj: Any) -> Any:
+        if hasattr(obj, "__json_serializable__"):
+            return obj.__json_serializable__()
+
+        # Let the base class default method raise the TypeError
+        return super().default(obj)
+
+
+class SliceHasher:
+    _ENCODING: Final = "utf-8"
+
+    @classmethod
+    def hash(cls, stream_name: str, stream_slice: Optional[Mapping[str, Any]] = None) -> int:
+        if stream_slice:
+            try:
+                s = json.dumps(stream_slice, sort_keys=True, cls=SliceEncoder)
+                hash_input = f"{stream_name}:{s}".encode(cls._ENCODING)
+            except TypeError as e:
+                raise ValueError(f"Failed to serialize stream slice: {e}")
+        else:
+            hash_input = stream_name.encode(cls._ENCODING)
+
+        # Use last 8 bytes as 64-bit integer for better distribution
+        return int.from_bytes(hashlib.sha256(hash_input).digest()[-8:], "big")
```
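`SliceEncoder` lets slice values that are not natively JSON-serializable participate in hashing via a `__json_serializable__` hook. A sketch with a hypothetical `DateWindow` type:

```python
import json
from datetime import date

from airbyte_cdk.utils.slice_hasher import SliceEncoder, SliceHasher


class DateWindow:
    """Hypothetical slice value serialized through the __json_serializable__ hook."""

    def __init__(self, start: date, end: date):
        self.start, self.end = start, end

    def __json_serializable__(self):
        return [self.start.isoformat(), self.end.isoformat()]


window = DateWindow(date(2024, 1, 1), date(2024, 1, 2))
print(json.dumps({"window": window}, cls=SliceEncoder))  # {"window": ["2024-01-01", "2024-01-02"]}
print(SliceHasher.hash("users", {"window": window}))     # stable 64-bit int

# A value without the hook raises TypeError inside SliceEncoder, which
# SliceHasher.hash surfaces as ValueError.
```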
|