airbyte-cdk 6.6.0rc1__py3-none-any.whl → 6.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. airbyte_cdk/__init__.py +10 -4
  2. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +2 -1
  3. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +58 -28
  4. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +43 -0
  5. airbyte_cdk/sources/declarative/decoders/__init__.py +2 -2
  6. airbyte_cdk/sources/declarative/decoders/json_decoder.py +45 -12
  7. airbyte_cdk/sources/declarative/manifest_declarative_source.py +6 -3
  8. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +7 -3
  9. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +45 -4
  10. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +23 -3
  11. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +85 -0
  12. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +6 -13
  13. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +69 -19
  14. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +0 -11
  15. airbyte_cdk/sources/streams/concurrent/adapters.py +4 -102
  16. airbyte_cdk/sources/streams/concurrent/cursor.py +50 -17
  17. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +0 -15
  18. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  19. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +7 -0
  20. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +5 -1
  21. airbyte_cdk/utils/slice_hasher.py +30 -0
  22. {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.2.dist-info}/METADATA +7 -8
  23. {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.2.dist-info}/RECORD +25 -22
  24. {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.2.dist-info}/LICENSE.txt +0 -0
  25. {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.2.dist-info}/WHEEL +0 -0
airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py
@@ -2,18 +2,17 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
-from abc import abstractmethod
-from dataclasses import dataclass
-from typing import Iterable
+from abc import ABC
 
 from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import (
     RequestOptionsProvider,
 )
-from airbyte_cdk.sources.types import StreamSlice
+from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import (
+    StreamSlicer as ConcurrentStreamSlicer,
+)
 
 
-@dataclass
-class StreamSlicer(RequestOptionsProvider):
+class StreamSlicer(ConcurrentStreamSlicer, RequestOptionsProvider, ABC):
     """
     Slices the stream into a subset of records.
     Slices enable state checkpointing and data retrieval parallelization.
@@ -23,10 +22,4 @@ class StreamSlicer(RequestOptionsProvider):
     See the stream slicing section of the docs for more information.
     """
 
-    @abstractmethod
-    def stream_slices(self) -> Iterable[StreamSlice]:
-        """
-        Defines stream slices
-
-        :return: List of stream slices
-        """
+    pass
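The declarative StreamSlicer now inherits its stream_slices contract from the concurrent StreamSlicer (added as file 18 below) and only layers the request-options interface on top. A minimal sketch of a concrete slicer under the new base class; SingleSliceSlicer and the **kwargs signatures are illustrative, not part of the package:

from typing import Any, Iterable, Mapping

from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer
from airbyte_cdk.sources.types import StreamSlice


class SingleSliceSlicer(StreamSlicer):  # illustrative name, not in the package
    """Emit one empty slice: the whole stream is fetched in a single pass."""

    def stream_slices(self) -> Iterable[StreamSlice]:
        yield StreamSlice(partition={}, cursor_slice={})

    # RequestOptionsProvider methods: this slicer contributes nothing to requests.
    def get_request_params(self, **kwargs: Any) -> Mapping[str, Any]:
        return {}

    def get_request_headers(self, **kwargs: Any) -> Mapping[str, Any]:
        return {}

    def get_request_body_data(self, **kwargs: Any) -> Mapping[str, Any]:
        return {}

    def get_request_body_json(self, **kwargs: Any) -> Mapping[str, Any]:
        return {}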
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
@@ -29,16 +29,25 @@ from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
 from airbyte_cdk.utils import is_cloud_environment
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 from unstructured.file_utils.filetype import (
+    EXT_TO_FILETYPE,
     FILETYPE_TO_MIMETYPE,
     STR_TO_FILETYPE,
     FileType,
     detect_filetype,
 )
+import nltk
 
 unstructured_partition_pdf = None
 unstructured_partition_docx = None
 unstructured_partition_pptx = None
 
+try:
+    nltk.data.find("tokenizers/punkt.zip")
+    nltk.data.find("tokenizers/punkt_tab.zip")
+except LookupError:
+    nltk.download("punkt")
+    nltk.download("punkt_tab")
+
 
 def optional_decode(contents: Union[str, bytes]) -> str:
     if isinstance(contents, bytes):
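The new guard fetches the punkt tokenizer data once, at import time, instead of letting unstructured raise a LookupError mid-sync on hosts without the NLTK data. The same check-then-download pattern, sketched as a loop (quiet is a standard nltk.download flag; the loop itself is illustrative):

import nltk

for resource in ("punkt", "punkt_tab"):
    try:
        # Raises LookupError when the tokenizer data is not installed locally.
        nltk.data.find(f"tokenizers/{resource}.zip")
    except LookupError:
        nltk.download(resource, quiet=True)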
@@ -108,9 +117,11 @@ class UnstructuredParser(FileTypeParser):
         format = _extract_format(config)
         with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
             filetype = self._get_filetype(file_handle, file)
-
             if filetype not in self._supported_file_types() and not format.skip_unprocessable_files:
-                raise self._create_parse_error(file, self._get_file_type_error_message(filetype))
+                raise self._create_parse_error(
+                    file,
+                    self._get_file_type_error_message(filetype),
+                )
 
             return {
                 "content": {
@@ -159,6 +170,10 @@ class UnstructuredParser(FileTypeParser):
                 logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
             else:
                 raise e
+        except Exception as e:
+            exception_str = str(e)
+            logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
+            raise e
 
     def _read_file(
         self,
@@ -176,20 +191,32 @@ class UnstructuredParser(FileTypeParser):
             # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
             raise Exception("unstructured library is not available")
 
-        filetype = self._get_filetype(file_handle, remote_file)
+        filetype: FileType | None = self._get_filetype(file_handle, remote_file)
 
-        if filetype == FileType.MD or filetype == FileType.TXT:
+        if filetype is None or filetype not in self._supported_file_types():
+            raise self._create_parse_error(
+                remote_file,
+                self._get_file_type_error_message(filetype),
+            )
+        if filetype in {FileType.MD, FileType.TXT}:
             file_content: bytes = file_handle.read()
             decoded_content: str = optional_decode(file_content)
             return decoded_content
-        if filetype not in self._supported_file_types():
-            raise self._create_parse_error(remote_file, self._get_file_type_error_message(filetype))
         if format.processing.mode == "local":
-            return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
+            return self._read_file_locally(
+                file_handle,
+                filetype,
+                format.strategy,
+                remote_file,
+            )
         elif format.processing.mode == "api":
             try:
                 result: str = self._read_file_remotely_with_retries(
-                    file_handle, format.processing, filetype, format.strategy, remote_file
+                    file_handle,
+                    format.processing,
+                    filetype,
+                    format.strategy,
+                    remote_file,
                 )
             except Exception as e:
                 # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
@@ -336,7 +363,11 @@ class UnstructuredParser(FileTypeParser):
 
         return self._render_markdown([element.to_dict() for element in elements])
 
-    def _create_parse_error(self, remote_file: RemoteFile, message: str) -> RecordParseError:
+    def _create_parse_error(
+        self,
+        remote_file: RemoteFile,
+        message: str,
+    ) -> RecordParseError:
         return RecordParseError(
             FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
         )
@@ -360,32 +391,51 @@ class UnstructuredParser(FileTypeParser):
         # detect_filetype is either using the file name or file content
         # if possible, try to leverage the file name to detect the file type
        # if the file name is not available, use the file content
-        file_type = detect_filetype(
-            filename=remote_file.uri,
-        )
-        if file_type is not None and not file_type == FileType.UNK:
+        file_type: FileType | None = None
+        try:
+            file_type = detect_filetype(
+                filename=remote_file.uri,
+            )
+        except Exception:
+            # Path doesn't exist locally. Try something else...
+            pass
+
+        if file_type and file_type != FileType.UNK:
             return file_type
 
         type_based_on_content = detect_filetype(file=file)
+        file.seek(0)  # detect_filetype is reading to read the file content, so we need to reset
 
-        # detect_filetype is reading to read the file content
-        file.seek(0)
+        if type_based_on_content and type_based_on_content != FileType.UNK:
+            return type_based_on_content
 
-        return type_based_on_content
+        extension = "." + remote_file.uri.split(".")[-1].lower()
+        if extension in EXT_TO_FILETYPE:
+            return EXT_TO_FILETYPE[extension]
+
+        return None
 
     def _supported_file_types(self) -> List[Any]:
         return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT]
 
-    def _get_file_type_error_message(self, file_type: FileType) -> str:
+    def _get_file_type_error_message(
+        self,
+        file_type: FileType | None,
+    ) -> str:
         supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
-        return f"File type {file_type} is not supported. Supported file types are {supported_file_types}"
+        return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"
 
     def _render_markdown(self, elements: List[Any]) -> str:
         return "\n\n".join((self._convert_to_markdown(el) for el in elements))
 
     def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
         if dpath.get(el, "type") == "Title":
-            heading_str = "#" * (dpath.get(el, "metadata/category_depth", default=1) or 1)
+            category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
+            if not isinstance(category_depth, int):
+                category_depth = (
+                    int(category_depth) if isinstance(category_depth, (str, float)) else 1
+                )
+            heading_str = "#" * category_depth
             return f"{heading_str} {dpath.get(el, 'text')}"
         elif dpath.get(el, "type") == "ListItem":
             return f"- {dpath.get(el, 'text')}"
airbyte_cdk/sources/file_based/stream/concurrent/adapters.py
@@ -226,7 +226,6 @@ class FileBasedStreamPartition(Partition):
         sync_mode: SyncMode,
         cursor_field: Optional[List[str]],
         state: Optional[MutableMapping[str, Any]],
-        cursor: "AbstractConcurrentFileBasedCursor",
     ):
         self._stream = stream
         self._slice = _slice
@@ -234,8 +233,6 @@ class FileBasedStreamPartition(Partition):
         self._sync_mode = sync_mode
         self._cursor_field = cursor_field
         self._state = state
-        self._cursor = cursor
-        self._is_closed = False
 
     def read(self) -> Iterable[Record]:
         try:
@@ -289,13 +286,6 @@ class FileBasedStreamPartition(Partition):
             file = self._slice["files"][0]
             return {"files": [file]}
 
-    def close(self) -> None:
-        self._cursor.close_partition(self)
-        self._is_closed = True
-
-    def is_closed(self) -> bool:
-        return self._is_closed
-
     def __hash__(self) -> int:
         if self._slice:
             # Convert the slice to a string so that it can be hashed
@@ -352,7 +342,6 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):
                     self._sync_mode,
                     self._cursor_field,
                     self._state,
-                    self._cursor,
                 )
             )
         self._cursor.set_pending_partitions(pending_partitions)
airbyte_cdk/sources/streams/concurrent/adapters.py
@@ -38,15 +38,13 @@ from airbyte_cdk.sources.streams.concurrent.helpers import (
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
-from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
-    DateTimeStreamStateConverter,
-)
 from airbyte_cdk.sources.streams.core import StreamData
-from airbyte_cdk.sources.types import StreamSlice
 from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
 from airbyte_cdk.sources.utils.slice_logger import SliceLogger
 from deprecated.classic import deprecated
 
+from airbyte_cdk.utils.slice_hasher import SliceHasher
+
 """
 This module contains adapters to help enabling concurrency on Stream objects without needing to migrate to AbstractStream
 """
@@ -96,7 +94,6 @@ class StreamFacade(AbstractStreamFacade[DefaultStream], Stream):
                     else SyncMode.incremental,
                     [cursor_field] if cursor_field is not None else None,
                     state,
-                    cursor,
                 ),
                 name=stream.name,
                 namespace=stream.namespace,
@@ -259,7 +256,6 @@ class StreamPartition(Partition):
         sync_mode: SyncMode,
         cursor_field: Optional[List[str]],
         state: Optional[MutableMapping[str, Any]],
-        cursor: Cursor,
     ):
         """
         :param stream: The stream to delegate to
@@ -272,8 +268,7 @@ class StreamPartition(Partition):
         self._sync_mode = sync_mode
         self._cursor_field = cursor_field
         self._state = state
-        self._cursor = cursor
-        self._is_closed = False
+        self._hash = SliceHasher.hash(self._stream.name, self._slice)
 
     def read(self) -> Iterable[Record]:
         """
@@ -313,23 +308,11 @@ class StreamPartition(Partition):
         return self._slice
 
     def __hash__(self) -> int:
-        if self._slice:
-            # Convert the slice to a string so that it can be hashed
-            s = json.dumps(self._slice, sort_keys=True, cls=SliceEncoder)
-            return hash((self._stream.name, s))
-        else:
-            return hash(self._stream.name)
+        return self._hash
 
     def stream_name(self) -> str:
         return self._stream.name
 
-    def close(self) -> None:
-        self._cursor.close_partition(self)
-        self._is_closed = True
-
-    def is_closed(self) -> bool:
-        return self._is_closed
-
     def __repr__(self) -> str:
         return f"StreamPartition({self._stream.name}, {self._slice})"
 
@@ -349,7 +332,6 @@ class StreamPartitionGenerator(PartitionGenerator):
         sync_mode: SyncMode,
         cursor_field: Optional[List[str]],
         state: Optional[MutableMapping[str, Any]],
-        cursor: Cursor,
     ):
         """
         :param stream: The stream to delegate to
@@ -360,7 +342,6 @@ class StreamPartitionGenerator(PartitionGenerator):
         self._sync_mode = sync_mode
         self._cursor_field = cursor_field
         self._state = state
-        self._cursor = cursor
 
     def generate(self) -> Iterable[Partition]:
         for s in self._stream.stream_slices(
@@ -373,85 +354,6 @@ class StreamPartitionGenerator(PartitionGenerator):
                 self._sync_mode,
                 self._cursor_field,
                 self._state,
-                self._cursor,
-            )
-
-
-class CursorPartitionGenerator(PartitionGenerator):
-    """
-    This class generates partitions using the concurrent cursor and iterates through state slices to generate partitions.
-
-    It is used when synchronizing a stream in incremental or full-refresh mode where state information is maintained
-    across partitions. Each partition represents a subset of the stream's data and is determined by the cursor's state.
-    """
-
-    _START_BOUNDARY = 0
-    _END_BOUNDARY = 1
-
-    def __init__(
-        self,
-        stream: Stream,
-        message_repository: MessageRepository,
-        cursor: Cursor,
-        connector_state_converter: DateTimeStreamStateConverter,
-        cursor_field: Optional[List[str]],
-        slice_boundary_fields: Optional[Tuple[str, str]],
-    ):
-        """
-        Initialize the CursorPartitionGenerator with a stream, sync mode, and cursor.
-
-        :param stream: The stream to delegate to for partition generation.
-        :param message_repository: The message repository to use to emit non-record messages.
-        :param sync_mode: The synchronization mode.
-        :param cursor: A Cursor object that maintains the state and the cursor field.
-        """
-        self._stream = stream
-        self.message_repository = message_repository
-        self._sync_mode = SyncMode.full_refresh
-        self._cursor = cursor
-        self._cursor_field = cursor_field
-        self._state = self._cursor.state
-        self._slice_boundary_fields = slice_boundary_fields
-        self._connector_state_converter = connector_state_converter
-
-    def generate(self) -> Iterable[Partition]:
-        """
-        Generate partitions based on the slices in the cursor's state.
-
-        This method iterates through the list of slices found in the cursor's state, and for each slice, it generates
-        a `StreamPartition` object.
-
-        :return: An iterable of StreamPartition objects.
-        """
-
-        start_boundary = (
-            self._slice_boundary_fields[self._START_BOUNDARY]
-            if self._slice_boundary_fields
-            else "start"
-        )
-        end_boundary = (
-            self._slice_boundary_fields[self._END_BOUNDARY]
-            if self._slice_boundary_fields
-            else "end"
-        )
-
-        for slice_start, slice_end in self._cursor.generate_slices():
-            stream_slice = StreamSlice(
-                partition={},
-                cursor_slice={
-                    start_boundary: self._connector_state_converter.output_format(slice_start),
-                    end_boundary: self._connector_state_converter.output_format(slice_end),
-                },
-            )
-
-            yield StreamPartition(
-                self._stream,
-                copy.deepcopy(stream_slice),
-                self.message_repository,
-                self._sync_mode,
-                self._cursor_field,
-                self._state,
-                self._cursor,
             )
 
 
airbyte_cdk/sources/streams/concurrent/cursor.py
@@ -11,9 +11,11 @@ from airbyte_cdk.sources.message import MessageRepository
 from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
+from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
 from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import (
     AbstractStreamStateConverter,
 )
+from airbyte_cdk.sources.types import StreamSlice
 
 
 def _extract_value(mapping: Mapping[str, Any], path: List[str]) -> Any:
@@ -61,7 +63,7 @@ class CursorField:
         return cursor_value  # type: ignore  # we assume that the value the path points at is a comparable
 
 
-class Cursor(ABC):
+class Cursor(StreamSlicer, ABC):
     @property
     @abstractmethod
     def state(self) -> MutableMapping[str, Any]: ...
@@ -88,12 +90,12 @@ class Cursor(ABC):
         """
         raise NotImplementedError()
 
-    def generate_slices(self) -> Iterable[Tuple[Any, Any]]:
+    def stream_slices(self) -> Iterable[StreamSlice]:
        """
         Default placeholder implementation of generate_slices.
         Subclasses can override this method to provide actual behavior.
         """
-        yield from ()
+        yield StreamSlice(partition={}, cursor_slice={})
 
 
 class FinalStateCursor(Cursor):
@@ -184,8 +186,15 @@ class ConcurrentCursor(Cursor):
         return self._cursor_field
 
     @property
-    def slice_boundary_fields(self) -> Optional[Tuple[str, str]]:
-        return self._slice_boundary_fields
+    def _slice_boundary_fields_wrapper(self) -> Tuple[str, str]:
+        return (
+            self._slice_boundary_fields
+            if self._slice_boundary_fields
+            else (
+                self._connector_state_converter.START_KEY,
+                self._connector_state_converter.END_KEY,
+            )
+        )
 
     def _get_concurrent_state(
         self, state: MutableMapping[str, Any]
@@ -299,7 +308,7 @@ class ConcurrentCursor(Cursor):
         """
         self._emit_state_message()
 
-    def generate_slices(self) -> Iterable[Tuple[CursorValueType, CursorValueType]]:
+    def stream_slices(self) -> Iterable[StreamSlice]:
         """
         Generating slices based on a few parameters:
         * lookback_window: Buffer to remove from END_KEY of the highest slice
@@ -368,7 +377,7 @@
 
     def _split_per_slice_range(
         self, lower: CursorValueType, upper: CursorValueType, upper_is_end: bool
-    ) -> Iterable[Tuple[CursorValueType, CursorValueType]]:
+    ) -> Iterable[StreamSlice]:
         if lower >= upper:
             return
@@ -377,10 +386,22 @@
 
         lower = max(lower, self._start) if self._start else lower
         if not self._slice_range or self._evaluate_upper_safely(lower, self._slice_range) >= upper:
-            if self._cursor_granularity and not upper_is_end:
-                yield lower, upper - self._cursor_granularity
-            else:
-                yield lower, upper
+            start_value, end_value = (
+                (lower, upper - self._cursor_granularity)
+                if self._cursor_granularity and not upper_is_end
+                else (lower, upper)
+            )
+            yield StreamSlice(
+                partition={},
+                cursor_slice={
+                    self._slice_boundary_fields_wrapper[
+                        self._START_BOUNDARY
+                    ]: self._connector_state_converter.output_format(start_value),
+                    self._slice_boundary_fields_wrapper[
+                        self._END_BOUNDARY
+                    ]: self._connector_state_converter.output_format(end_value),
+                },
+            )
         else:
             stop_processing = False
             current_lower_boundary = lower
@@ -389,12 +410,24 @@
                     self._evaluate_upper_safely(current_lower_boundary, self._slice_range), upper
                 )
                 has_reached_upper_boundary = current_upper_boundary >= upper
-                if self._cursor_granularity and (
-                    not upper_is_end or not has_reached_upper_boundary
-                ):
-                    yield current_lower_boundary, current_upper_boundary - self._cursor_granularity
-                else:
-                    yield current_lower_boundary, current_upper_boundary
+
+                start_value, end_value = (
+                    (current_lower_boundary, current_upper_boundary - self._cursor_granularity)
+                    if self._cursor_granularity
+                    and (not upper_is_end or not has_reached_upper_boundary)
+                    else (current_lower_boundary, current_upper_boundary)
+                )
+                yield StreamSlice(
+                    partition={},
+                    cursor_slice={
+                        self._slice_boundary_fields_wrapper[
+                            self._START_BOUNDARY
+                        ]: self._connector_state_converter.output_format(start_value),
+                        self._slice_boundary_fields_wrapper[
+                            self._END_BOUNDARY
+                        ]: self._connector_state_converter.output_format(end_value),
+                    },
+                )
                 current_lower_boundary = current_upper_boundary
                 if current_upper_boundary >= upper:
                     stop_processing = True
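The net effect of the cursor.py changes above: generate_slices, which yielded raw (start, end) tuples for the removed CursorPartitionGenerator to wrap, is replaced by stream_slices, which yields ready-to-use StreamSlice objects whose cursor_slice values are already passed through the state converter's output_format and keyed by the configured boundary fields (falling back to the converter's START_KEY/END_KEY). A sketch of a consumer, assuming a cursor configured with slice_boundary_fields=("start", "end"); `cursor` stands for a configured ConcurrentCursor instance:

for stream_slice in cursor.stream_slices():
    start = stream_slice.cursor_slice["start"]  # already formatted via output_format
    end = stream_slice.cursor_slice["end"]
    print(f"fetch records where {start} <= cursor_value < {end}")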
airbyte_cdk/sources/streams/concurrent/partitions/partition.py
@@ -40,21 +40,6 @@ class Partition(ABC):
         """
         pass
 
-    @abstractmethod
-    def close(self) -> None:
-        """
-        Closes the partition.
-        """
-        pass
-
-    @abstractmethod
-    def is_closed(self) -> bool:
-        """
-        Returns whether the partition is closed.
-        :return:
-        """
-        pass
-
     @abstractmethod
     def __hash__(self) -> int:
         """
airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py (new file)
@@ -0,0 +1,21 @@
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+
+from abc import ABC, abstractmethod
+from typing import Iterable
+
+from airbyte_cdk.sources.types import StreamSlice
+
+
+class StreamSlicer(ABC):
+    """
+    Slices the stream into chunks that can be fetched independently. Slices enable state checkpointing and data retrieval parallelization.
+    """
+
+    @abstractmethod
+    def stream_slices(self) -> Iterable[StreamSlice]:
+        """
+        Defines stream slices
+
+        :return: An iterable of stream slices
+        """
+        pass
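This small interface is what lets the declarative StreamSlicer (file 12) and Cursor (file 16) share a single contract: anything that can enumerate slices. Code can now be written against the interface alone and accept either; a sketch with an illustrative helper:

from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer


def count_slices(slicer: StreamSlicer) -> int:
    # Works for any implementation: a declarative slicer, a ConcurrentCursor, etc.
    return sum(1 for _ in slicer.stream_slices())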
airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py
@@ -124,6 +124,13 @@ class AbstractStreamStateConverter(ABC):
         """
         ...
 
+    @abstractmethod
+    def output_format(self, value: Any) -> Any:
+        """
+        Convert the cursor value type to a JSON valid type.
+        """
+        ...
+
     def merge_intervals(
         self, intervals: List[MutableMapping[str, Any]]
     ) -> List[MutableMapping[str, Any]]:
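output_format joins the abstract contract because ConcurrentCursor now calls it directly when building cursor_slice values (see file 16 above). A hypothetical datetime-based implementation, assuming ISO-8601 output; the converters shipped in this package define their own formats:

from datetime import datetime
from typing import Any

from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import (
    AbstractStreamStateConverter,
)


class IsoStateConverter(AbstractStreamStateConverter):  # hypothetical and partial:
    # the remaining abstract methods would still need implementations.
    def output_format(self, timestamp: datetime) -> Any:
        # Turn the datetime cursor value into a JSON-valid ISO-8601 string.
        return timestamp.isoformat()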
airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py
@@ -82,7 +82,11 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
         # The start and end are the same to avoid confusion as to whether the records for this slice
         # were actually synced
         slices = [
-            {self.START_KEY: start if start is not None else sync_start, self.END_KEY: sync_start}
+            {
+                self.START_KEY: start if start is not None else sync_start,
+                self.END_KEY: sync_start,
+                self.MOST_RECENT_RECORD_KEY: sync_start,
+            }
         ]
 
         return sync_start, {
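With the extra key, the placeholder slice emitted at sync start carries the sync-start instant under all three keys, so downstream interval merging sees a uniform shape. A rough illustration of the resulting slice; the actual key strings come from the converter's START_KEY, END_KEY, and MOST_RECENT_RECORD_KEY class attributes, so the names below are stand-ins:

from datetime import datetime, timezone

sync_start = datetime(2024, 1, 1, tzinfo=timezone.utc)

slice_at_sync_start = {
    "start": sync_start,               # `start` when provided, else sync_start
    "end": sync_start,
    "most_recent_record": sync_start,  # stand-in name for MOST_RECENT_RECORD_KEY
}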
airbyte_cdk/utils/slice_hasher.py (new file)
@@ -0,0 +1,30 @@
+import hashlib
+import json
+from typing import Any, Mapping, Optional, Final
+
+
+class SliceEncoder(json.JSONEncoder):
+    def default(self, obj: Any) -> Any:
+        if hasattr(obj, "__json_serializable__"):
+            return obj.__json_serializable__()
+
+        # Let the base class default method raise the TypeError
+        return super().default(obj)
+
+
+class SliceHasher:
+    _ENCODING: Final = "utf-8"
+
+    @classmethod
+    def hash(cls, stream_name: str, stream_slice: Optional[Mapping[str, Any]] = None) -> int:
+        if stream_slice:
+            try:
+                s = json.dumps(stream_slice, sort_keys=True, cls=SliceEncoder)
+                hash_input = f"{stream_name}:{s}".encode(cls._ENCODING)
+            except TypeError as e:
+                raise ValueError(f"Failed to serialize stream slice: {e}")
+        else:
+            hash_input = stream_name.encode(cls._ENCODING)
+
+        # Use last 8 bytes as 64-bit integer for better distribution
+        return int.from_bytes(hashlib.sha256(hash_input).digest()[-8:], "big")
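SliceHasher replaces the ad-hoc hash((stream_name, json.dumps(...))) previously inlined in StreamPartition.__hash__ (file 15): the SHA-256-derived value is deterministic across processes, unlike Python's built-in hash(), which is randomized per run for strings. A usage sketch with illustrative values:

from airbyte_cdk.utils.slice_hasher import SliceHasher

h1 = SliceHasher.hash("users", {"start": "2024-01-01", "end": "2024-01-02"})
h2 = SliceHasher.hash("users", {"end": "2024-01-02", "start": "2024-01-01"})
assert h1 == h2  # sort_keys=True makes key order irrelevant

h3 = SliceHasher.hash("users")  # no slice: hashes the stream name alone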