airbyte_cdk-6.6.0rc1-py3-none-any.whl → airbyte_cdk-6.6.2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in the public registry.
- airbyte_cdk/__init__.py +10 -4
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +2 -1
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +58 -28
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +43 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +2 -2
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +45 -12
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +6 -3
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +7 -3
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +45 -4
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +23 -3
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +85 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +6 -13
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +69 -19
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +0 -11
- airbyte_cdk/sources/streams/concurrent/adapters.py +4 -102
- airbyte_cdk/sources/streams/concurrent/cursor.py +50 -17
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +0 -15
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +7 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +5 -1
- airbyte_cdk/utils/slice_hasher.py +30 -0
- {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.2.dist-info}/METADATA +7 -8
- {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.2.dist-info}/RECORD +25 -22
- {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.2.dist-info}/WHEEL +0 -0
airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py

```diff
@@ -2,18 +2,17 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
-from abc import abstractmethod
-from dataclasses import dataclass
-from typing import Iterable
+from abc import ABC
 
 from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import (
     RequestOptionsProvider,
 )
-from airbyte_cdk.sources.types import StreamSlice
+from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import (
+    StreamSlicer as ConcurrentStreamSlicer,
+)
 
 
-@dataclass
-class StreamSlicer(RequestOptionsProvider):
+class StreamSlicer(ConcurrentStreamSlicer, RequestOptionsProvider, ABC):
     """
     Slices the stream into a subset of records.
     Slices enable state checkpointing and data retrieval parallelization.
@@ -23,10 +22,4 @@ class StreamSlicer(RequestOptionsProvider):
     See the stream slicing section of the docs for more information.
     """
 
-    @abstractmethod
-    def stream_slices(self) -> Iterable[StreamSlice]:
-        """
-        Defines stream slices
-
-        :return: List of stream slices
-        """
+    pass
```
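The declarative `StreamSlicer` now inherits its `stream_slices()` contract from the new concurrent `StreamSlicer` ABC (added later in this diff) while keeping its `RequestOptionsProvider` surface. A minimal sketch of a concrete subclass against airbyte-cdk 6.6.2 — the `SingleSliceSlicer` name and the trivial method bodies are illustrative, not part of the CDK:

```python
from typing import Any, Iterable, Mapping, Optional, Union

from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer
from airbyte_cdk.sources.types import StreamSlice


class SingleSliceSlicer(StreamSlicer):
    """Hypothetical slicer: one empty slice, no extra request options."""

    def stream_slices(self) -> Iterable[StreamSlice]:
        # Satisfies the concurrent StreamSlicer half of the interface.
        yield StreamSlice(partition={}, cursor_slice={})

    # The four methods below satisfy the RequestOptionsProvider half.
    def get_request_params(self, *, stream_state=None, stream_slice=None, next_page_token=None) -> Mapping[str, Any]:
        return {}

    def get_request_headers(self, *, stream_state=None, stream_slice=None, next_page_token=None) -> Mapping[str, Any]:
        return {}

    def get_request_body_data(self, *, stream_state=None, stream_slice=None, next_page_token=None) -> Union[Mapping[str, Any], str]:
        return {}

    def get_request_body_json(self, *, stream_state=None, stream_slice=None, next_page_token=None) -> Optional[Mapping[str, Any]]:
        return {}
```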
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

```diff
@@ -29,16 +29,25 @@ from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
 from airbyte_cdk.utils import is_cloud_environment
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 from unstructured.file_utils.filetype import (
+    EXT_TO_FILETYPE,
     FILETYPE_TO_MIMETYPE,
     STR_TO_FILETYPE,
     FileType,
     detect_filetype,
 )
+import nltk
 
 unstructured_partition_pdf = None
 unstructured_partition_docx = None
 unstructured_partition_pptx = None
 
+try:
+    nltk.data.find("tokenizers/punkt.zip")
+    nltk.data.find("tokenizers/punkt_tab.zip")
+except LookupError:
+    nltk.download("punkt")
+    nltk.download("punkt_tab")
+
 
 def optional_decode(contents: Union[str, bytes]) -> str:
     if isinstance(contents, bytes):
```
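The guard above downloads the `punkt` tokenizers the first time the module is imported. For network-restricted deployments, the data can be pre-seeded at build time so the `LookupError` branch is never taken; a sketch, where the `/opt/nltk_data` directory is an assumption rather than a CDK convention:

```python
import nltk

# Fetch the tokenizers once, e.g. in a Dockerfile build step (hypothetical dir).
nltk.download("punkt", download_dir="/opt/nltk_data")
nltk.download("punkt_tab", download_dir="/opt/nltk_data")

# Make the directory visible to nltk.data.find() at runtime.
nltk.data.path.append("/opt/nltk_data")
nltk.data.find("tokenizers/punkt.zip")  # resolves locally, no download attempt
```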
```diff
@@ -108,9 +117,11 @@ class UnstructuredParser(FileTypeParser):
         format = _extract_format(config)
         with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
             filetype = self._get_filetype(file_handle, file)
-
             if filetype not in self._supported_file_types() and not format.skip_unprocessable_files:
-                raise self._create_parse_error(file, self._get_file_type_error_message(filetype))
+                raise self._create_parse_error(
+                    file,
+                    self._get_file_type_error_message(filetype),
+                )
 
             return {
                 "content": {
```
```diff
@@ -159,6 +170,10 @@ class UnstructuredParser(FileTypeParser):
                 logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
             else:
                 raise e
+        except Exception as e:
+            exception_str = str(e)
+            logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
+            raise e
 
     def _read_file(
         self,
```
```diff
@@ -176,20 +191,32 @@ class UnstructuredParser(FileTypeParser):
             # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
             raise Exception("unstructured library is not available")
 
-        filetype = self._get_filetype(file_handle, remote_file)
+        filetype: FileType | None = self._get_filetype(file_handle, remote_file)
 
-        if filetype == FileType.MD or filetype == FileType.TXT:
+        if filetype is None or filetype not in self._supported_file_types():
+            raise self._create_parse_error(
+                remote_file,
+                self._get_file_type_error_message(filetype),
+            )
+        if filetype in {FileType.MD, FileType.TXT}:
             file_content: bytes = file_handle.read()
             decoded_content: str = optional_decode(file_content)
             return decoded_content
-        if filetype not in self._supported_file_types():
-            raise self._create_parse_error(remote_file, self._get_file_type_error_message(filetype))
         if format.processing.mode == "local":
-            return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
+            return self._read_file_locally(
+                file_handle,
+                filetype,
+                format.strategy,
+                remote_file,
+            )
         elif format.processing.mode == "api":
             try:
                 result: str = self._read_file_remotely_with_retries(
-                    file_handle, format.processing, filetype, format.strategy, remote_file
+                    file_handle,
+                    format.processing,
+                    filetype,
+                    format.strategy,
+                    remote_file,
                 )
             except Exception as e:
                 # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
```
```diff
@@ -336,7 +363,11 @@ class UnstructuredParser(FileTypeParser):
 
         return self._render_markdown([element.to_dict() for element in elements])
 
-    def _create_parse_error(self, remote_file: RemoteFile, message: str) -> RecordParseError:
+    def _create_parse_error(
+        self,
+        remote_file: RemoteFile,
+        message: str,
+    ) -> RecordParseError:
         return RecordParseError(
             FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
         )
```
```diff
@@ -360,32 +391,51 @@ class UnstructuredParser(FileTypeParser):
         # detect_filetype is either using the file name or file content
         # if possible, try to leverage the file name to detect the file type
         # if the file name is not available, use the file content
-        file_type = detect_filetype(
-            filename=remote_file.uri,
-        )
-        if file_type is not None and not file_type == FileType.UNK:
+        file_type: FileType | None = None
+        try:
+            file_type = detect_filetype(
+                filename=remote_file.uri,
+            )
+        except Exception:
+            # Path doesn't exist locally. Try something else...
+            pass
+
+        if file_type and file_type != FileType.UNK:
             return file_type
 
         type_based_on_content = detect_filetype(file=file)
+        file.seek(0)  # detect_filetype is reading to read the file content, so we need to reset
 
-        # detect_filetype is reading to read the file content
-        file.seek(0)
+        if type_based_on_content and type_based_on_content != FileType.UNK:
+            return type_based_on_content
 
-        return type_based_on_content
+        extension = "." + remote_file.uri.split(".")[-1].lower()
+        if extension in EXT_TO_FILETYPE:
+            return EXT_TO_FILETYPE[extension]
+
+        return None
 
     def _supported_file_types(self) -> List[Any]:
         return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT]
 
-    def _get_file_type_error_message(self, file_type: FileType) -> str:
+    def _get_file_type_error_message(
+        self,
+        file_type: FileType | None,
+    ) -> str:
         supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
-        return f"File type {file_type} is not supported. Supported file types are {supported_file_types}"
+        return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"
 
     def _render_markdown(self, elements: List[Any]) -> str:
         return "\n\n".join((self._convert_to_markdown(el) for el in elements))
 
     def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
         if dpath.get(el, "type") == "Title":
-            heading_str = "#" * (dpath.get(el, "metadata/category_depth", default=1) or 1)
+            category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
+            if not isinstance(category_depth, int):
+                category_depth = (
+                    int(category_depth) if isinstance(category_depth, (str, float)) else 1
+                )
+            heading_str = "#" * category_depth
             return f"{heading_str} {dpath.get(el, 'text')}"
         elif dpath.get(el, "type") == "ListItem":
             return f"- {dpath.get(el, 'text')}"
```
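`_get_filetype` now falls back through three strategies: name-based detection, content-based detection, and finally a plain extension lookup via `EXT_TO_FILETYPE`. A quick sketch of that last fallback, assuming the `unstructured` version the CDK pins:

```python
from unstructured.file_utils.filetype import EXT_TO_FILETYPE, FileType

uri = "reports/2024/summary.md"  # hypothetical file URI
extension = "." + uri.split(".")[-1].lower()
assert EXT_TO_FILETYPE.get(extension) is FileType.MD
assert EXT_TO_FILETYPE.get(".unknown") is None  # unsupported -> parser reports an error
```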
airbyte_cdk/sources/file_based/stream/concurrent/adapters.py

```diff
@@ -226,7 +226,6 @@ class FileBasedStreamPartition(Partition):
         sync_mode: SyncMode,
         cursor_field: Optional[List[str]],
         state: Optional[MutableMapping[str, Any]],
-        cursor: "AbstractConcurrentFileBasedCursor",
     ):
         self._stream = stream
         self._slice = _slice
@@ -234,8 +233,6 @@
         self._sync_mode = sync_mode
         self._cursor_field = cursor_field
         self._state = state
-        self._cursor = cursor
-        self._is_closed = False
 
     def read(self) -> Iterable[Record]:
         try:
@@ -289,13 +286,6 @@
             file = self._slice["files"][0]
             return {"files": [file]}
 
-    def close(self) -> None:
-        self._cursor.close_partition(self)
-        self._is_closed = True
-
-    def is_closed(self) -> bool:
-        return self._is_closed
-
     def __hash__(self) -> int:
         if self._slice:
             # Convert the slice to a string so that it can be hashed
@@ -352,7 +342,6 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):
                     self._sync_mode,
                     self._cursor_field,
                     self._state,
-                    self._cursor,
                 )
             )
         self._cursor.set_pending_partitions(pending_partitions)
```
airbyte_cdk/sources/streams/concurrent/adapters.py

```diff
@@ -38,15 +38,13 @@ from airbyte_cdk.sources.streams.concurrent.helpers import (
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
-from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
-    DateTimeStreamStateConverter,
-)
 from airbyte_cdk.sources.streams.core import StreamData
-from airbyte_cdk.sources.types import StreamSlice
 from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
 from airbyte_cdk.sources.utils.slice_logger import SliceLogger
 from deprecated.classic import deprecated
 
+from airbyte_cdk.utils.slice_hasher import SliceHasher
+
 """
 This module contains adapters to help enabling concurrency on Stream objects without needing to migrate to AbstractStream
 """
@@ -96,7 +94,6 @@ class StreamFacade(AbstractStreamFacade[DefaultStream], Stream):
                     else SyncMode.incremental,
                     [cursor_field] if cursor_field is not None else None,
                     state,
-                    cursor,
                 ),
                 name=stream.name,
                 namespace=stream.namespace,
@@ -259,7 +256,6 @@ class StreamPartition(Partition):
         sync_mode: SyncMode,
         cursor_field: Optional[List[str]],
         state: Optional[MutableMapping[str, Any]],
-        cursor: Cursor,
     ):
         """
         :param stream: The stream to delegate to
@@ -272,8 +268,7 @@
         self._sync_mode = sync_mode
         self._cursor_field = cursor_field
         self._state = state
-        self._cursor = cursor
-        self._is_closed = False
+        self._hash = SliceHasher.hash(self._stream.name, self._slice)
 
     def read(self) -> Iterable[Record]:
         """
@@ -313,23 +308,11 @@
         return self._slice
 
     def __hash__(self) -> int:
-        if self._slice:
-            # Convert the slice to a string so that it can be hashed
-            s = json.dumps(self._slice, sort_keys=True, cls=SliceEncoder)
-            return hash((self._stream.name, s))
-        else:
-            return hash(self._stream.name)
+        return self._hash
 
     def stream_name(self) -> str:
         return self._stream.name
 
-    def close(self) -> None:
-        self._cursor.close_partition(self)
-        self._is_closed = True
-
-    def is_closed(self) -> bool:
-        return self._is_closed
-
     def __repr__(self) -> str:
         return f"StreamPartition({self._stream.name}, {self._slice})"
 
@@ -349,7 +332,6 @@ class StreamPartitionGenerator(PartitionGenerator):
         sync_mode: SyncMode,
         cursor_field: Optional[List[str]],
         state: Optional[MutableMapping[str, Any]],
-        cursor: Cursor,
     ):
         """
         :param stream: The stream to delegate to
@@ -360,7 +342,6 @@
         self._sync_mode = sync_mode
         self._cursor_field = cursor_field
         self._state = state
-        self._cursor = cursor
 
     def generate(self) -> Iterable[Partition]:
         for s in self._stream.stream_slices(
@@ -373,85 +354,6 @@
                 self._sync_mode,
                 self._cursor_field,
                 self._state,
-                self._cursor,
-            )
-
-
-class CursorPartitionGenerator(PartitionGenerator):
-    """
-    This class generates partitions using the concurrent cursor and iterates through state slices to generate partitions.
-
-    It is used when synchronizing a stream in incremental or full-refresh mode where state information is maintained
-    across partitions. Each partition represents a subset of the stream's data and is determined by the cursor's state.
-    """
-
-    _START_BOUNDARY = 0
-    _END_BOUNDARY = 1
-
-    def __init__(
-        self,
-        stream: Stream,
-        message_repository: MessageRepository,
-        cursor: Cursor,
-        connector_state_converter: DateTimeStreamStateConverter,
-        cursor_field: Optional[List[str]],
-        slice_boundary_fields: Optional[Tuple[str, str]],
-    ):
-        """
-        Initialize the CursorPartitionGenerator with a stream, sync mode, and cursor.
-
-        :param stream: The stream to delegate to for partition generation.
-        :param message_repository: The message repository to use to emit non-record messages.
-        :param sync_mode: The synchronization mode.
-        :param cursor: A Cursor object that maintains the state and the cursor field.
-        """
-        self._stream = stream
-        self.message_repository = message_repository
-        self._sync_mode = SyncMode.full_refresh
-        self._cursor = cursor
-        self._cursor_field = cursor_field
-        self._state = self._cursor.state
-        self._slice_boundary_fields = slice_boundary_fields
-        self._connector_state_converter = connector_state_converter
-
-    def generate(self) -> Iterable[Partition]:
-        """
-        Generate partitions based on the slices in the cursor's state.
-
-        This method iterates through the list of slices found in the cursor's state, and for each slice, it generates
-        a `StreamPartition` object.
-
-        :return: An iterable of StreamPartition objects.
-        """
-
-        start_boundary = (
-            self._slice_boundary_fields[self._START_BOUNDARY]
-            if self._slice_boundary_fields
-            else "start"
-        )
-        end_boundary = (
-            self._slice_boundary_fields[self._END_BOUNDARY]
-            if self._slice_boundary_fields
-            else "end"
-        )
-
-        for slice_start, slice_end in self._cursor.generate_slices():
-            stream_slice = StreamSlice(
-                partition={},
-                cursor_slice={
-                    start_boundary: self._connector_state_converter.output_format(slice_start),
-                    end_boundary: self._connector_state_converter.output_format(slice_end),
-                },
-            )
-
-            yield StreamPartition(
-                self._stream,
-                copy.deepcopy(stream_slice),
-                self.message_repository,
-                self._sync_mode,
-                self._cursor_field,
-                self._state,
-                self._cursor,
             )
 
 
```
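`StreamPartition.__hash__` previously re-serialized the slice on every call; it now returns a hash precomputed in the constructor with the new `SliceHasher` utility (full source at the end of this diff). A small sketch of its behavior:

```python
from airbyte_cdk.utils.slice_hasher import SliceHasher

# Deterministic for equal (stream_name, slice) pairs; key order is irrelevant
# because the slice is serialized with json.dumps(..., sort_keys=True).
h1 = SliceHasher.hash("users", {"start": "2024-01-01", "end": "2024-01-02"})
h2 = SliceHasher.hash("users", {"end": "2024-01-02", "start": "2024-01-01"})
assert h1 == h2

# Slice-less partitions hash on the stream name alone.
print(SliceHasher.hash("users"))
```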
airbyte_cdk/sources/streams/concurrent/cursor.py

```diff
@@ -11,9 +11,11 @@ from airbyte_cdk.sources.message import MessageRepository
 from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
+from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
 from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import (
     AbstractStreamStateConverter,
 )
+from airbyte_cdk.sources.types import StreamSlice
 
 
 def _extract_value(mapping: Mapping[str, Any], path: List[str]) -> Any:
@@ -61,7 +63,7 @@ class CursorField:
         return cursor_value  # type: ignore  # we assume that the value the path points at is a comparable
 
 
-class Cursor(ABC):
+class Cursor(StreamSlicer, ABC):
     @property
     @abstractmethod
     def state(self) -> MutableMapping[str, Any]: ...
@@ -88,12 +90,12 @@ class Cursor(ABC):
         """
         raise NotImplementedError()
 
-    def generate_slices(self) -> Iterable[Tuple[CursorValueType, CursorValueType]]:
+    def stream_slices(self) -> Iterable[StreamSlice]:
         """
         Default placeholder implementation of generate_slices.
         Subclasses can override this method to provide actual behavior.
         """
-        yield
+        yield StreamSlice(partition={}, cursor_slice={})
 
 
 class FinalStateCursor(Cursor):
@@ -184,8 +186,15 @@ class ConcurrentCursor(Cursor):
         return self._cursor_field
 
     @property
-    def slice_boundary_fields(self) -> Optional[Tuple[str, str]]:
-        return self._slice_boundary_fields
+    def _slice_boundary_fields_wrapper(self) -> Tuple[str, str]:
+        return (
+            self._slice_boundary_fields
+            if self._slice_boundary_fields
+            else (
+                self._connector_state_converter.START_KEY,
+                self._connector_state_converter.END_KEY,
+            )
+        )
 
     def _get_concurrent_state(
         self, state: MutableMapping[str, Any]
@@ -299,7 +308,7 @@ class ConcurrentCursor(Cursor):
         """
         self._emit_state_message()
 
-    def generate_slices(self) -> Iterable[Tuple[CursorValueType, CursorValueType]]:
+    def stream_slices(self) -> Iterable[StreamSlice]:
         """
         Generating slices based on a few parameters:
         * lookback_window: Buffer to remove from END_KEY of the highest slice
@@ -368,7 +377,7 @@ class ConcurrentCursor(Cursor):
 
     def _split_per_slice_range(
         self, lower: CursorValueType, upper: CursorValueType, upper_is_end: bool
-    ) -> Iterable[Tuple[CursorValueType, CursorValueType]]:
+    ) -> Iterable[StreamSlice]:
         if lower >= upper:
             return
 
@@ -377,10 +386,22 @@ class ConcurrentCursor(Cursor):
 
         lower = max(lower, self._start) if self._start else lower
         if not self._slice_range or self._evaluate_upper_safely(lower, self._slice_range) >= upper:
-            yield (
-                lower,
-                upper - self._cursor_granularity if self._cursor_granularity and not upper_is_end else upper,
-            )
+            start_value, end_value = (
+                (lower, upper - self._cursor_granularity)
+                if self._cursor_granularity and not upper_is_end
+                else (lower, upper)
+            )
+            yield StreamSlice(
+                partition={},
+                cursor_slice={
+                    self._slice_boundary_fields_wrapper[
+                        self._START_BOUNDARY
+                    ]: self._connector_state_converter.output_format(start_value),
+                    self._slice_boundary_fields_wrapper[
+                        self._END_BOUNDARY
+                    ]: self._connector_state_converter.output_format(end_value),
+                },
+            )
         else:
             stop_processing = False
             current_lower_boundary = lower
@@ -389,12 +410,24 @@ class ConcurrentCursor(Cursor):
                 self._evaluate_upper_safely(current_lower_boundary, self._slice_range), upper
             )
             has_reached_upper_boundary = current_upper_boundary >= upper
-            yield current_lower_boundary, (
-                current_upper_boundary - self._cursor_granularity
-                if self._cursor_granularity
-                and (not upper_is_end or not has_reached_upper_boundary)
-                else current_upper_boundary
-            )
+
+            start_value, end_value = (
+                (current_lower_boundary, current_upper_boundary - self._cursor_granularity)
+                if self._cursor_granularity
+                and (not upper_is_end or not has_reached_upper_boundary)
+                else (current_lower_boundary, current_upper_boundary)
+            )
+            yield StreamSlice(
+                partition={},
+                cursor_slice={
+                    self._slice_boundary_fields_wrapper[
+                        self._START_BOUNDARY
+                    ]: self._connector_state_converter.output_format(start_value),
+                    self._slice_boundary_fields_wrapper[
+                        self._END_BOUNDARY
+                    ]: self._connector_state_converter.output_format(end_value),
+                },
+            )
             current_lower_boundary = current_upper_boundary
             if current_upper_boundary >= upper:
                 stop_processing = True
```
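With `Cursor` subclassing the concurrent `StreamSlicer`, cursors now yield ready-made `StreamSlice` objects (boundaries already serialized through the converter's `output_format`) instead of raw `(start, end)` tuples. A sketch of the base-class default via `FinalStateCursor`, assuming its constructor still takes the stream name, namespace, and a message repository, and that it keeps the inherited `stream_slices()`:

```python
from airbyte_cdk.sources.message import InMemoryMessageRepository
from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor

cursor = FinalStateCursor(
    stream_name="users",
    stream_namespace=None,
    message_repository=InMemoryMessageRepository(),
)

# The default implementation yields a single empty slice.
slices = list(cursor.stream_slices())
assert len(slices) == 1
assert slices[0].partition == {} and slices[0].cursor_slice == {}
```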
airbyte_cdk/sources/streams/concurrent/partitions/partition.py

```diff
@@ -40,21 +40,6 @@ class Partition(ABC):
         """
         pass
 
-    @abstractmethod
-    def close(self) -> None:
-        """
-        Closes the partition.
-        """
-        pass
-
-    @abstractmethod
-    def is_closed(self) -> bool:
-        """
-        Returns whether the partition is closed.
-        :return:
-        """
-        pass
-
     @abstractmethod
     def __hash__(self) -> int:
         """
```
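`close()` and `is_closed()` are gone from the `Partition` contract; partition lifecycle is now driven by the cursor (`cursor.close_partition(...)`) rather than by the partition itself. A minimal sketch of implementing the slimmed-down ABC — `EmptyPartition` is illustrative, and I'm assuming the remaining abstract methods are `read`, `to_slice`, `stream_name`, and `__hash__`:

```python
from typing import Any, Iterable, Mapping, Optional

from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
from airbyte_cdk.utils.slice_hasher import SliceHasher


class EmptyPartition(Partition):
    """Hypothetical partition that yields no records."""

    def __init__(self, stream_name: str, _slice: Optional[Mapping[str, Any]] = None):
        self._stream_name = stream_name
        self._slice = _slice

    def read(self) -> Iterable[Record]:
        # A real partition would perform I/O here; no close() bookkeeping anymore.
        yield from ()

    def to_slice(self) -> Optional[Mapping[str, Any]]:
        return self._slice

    def stream_name(self) -> str:
        return self._stream_name

    def __hash__(self) -> int:
        return SliceHasher.hash(self._stream_name, self._slice)
```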
airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py (new file)

```diff
@@ -0,0 +1,21 @@
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+
+from abc import ABC, abstractmethod
+from typing import Iterable
+
+from airbyte_cdk.sources.types import StreamSlice
+
+
+class StreamSlicer(ABC):
+    """
+    Slices the stream into chunks that can be fetched independently. Slices enable state checkpointing and data retrieval parallelization.
+    """
+
+    @abstractmethod
+    def stream_slices(self) -> Iterable[StreamSlice]:
+        """
+        Defines stream slices
+
+        :return: An iterable of stream slices
+        """
+        pass
```
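This new ABC is the single slicing interface now shared by declarative slicers and concurrent cursors. A sketch of a standalone implementation against airbyte-cdk 6.6.2 — the `DailySlicer` date-windowing logic is illustrative:

```python
from datetime import date, timedelta
from typing import Iterable

from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
from airbyte_cdk.sources.types import StreamSlice


class DailySlicer(StreamSlicer):
    """Hypothetical slicer emitting one slice per day in [start, end)."""

    def __init__(self, start: date, end: date):
        self._start = start
        self._end = end

    def stream_slices(self) -> Iterable[StreamSlice]:
        current = self._start
        while current < self._end:
            yield StreamSlice(
                partition={},
                cursor_slice={
                    "start": current.isoformat(),
                    "end": (current + timedelta(days=1)).isoformat(),
                },
            )
            current += timedelta(days=1)


for s in DailySlicer(date(2024, 1, 1), date(2024, 1, 3)).stream_slices():
    print(s.cursor_slice)
# {'start': '2024-01-01', 'end': '2024-01-02'}
# {'start': '2024-01-02', 'end': '2024-01-03'}
```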
airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py

```diff
@@ -124,6 +124,13 @@ class AbstractStreamStateConverter(ABC):
         """
         ...
 
+    @abstractmethod
+    def output_format(self, value: Any) -> Any:
+        """
+        Convert the cursor value type to a JSON valid type.
+        """
+        ...
+
     def merge_intervals(
         self, intervals: List[MutableMapping[str, Any]]
     ) -> List[MutableMapping[str, Any]]:
```
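`output_format` joins the abstract contract because `ConcurrentCursor.stream_slices()` calls it to serialize slice boundaries. A sketch of how a datetime-based converter would satisfy it — the class is a hypothetical fragment; only `output_format` is the required addition:

```python
from datetime import datetime, timezone
from typing import Any


class MyDatetimeConverterFragment:
    """Illustrative fragment: emit a JSON-safe ISO-8601 string for a datetime cursor."""

    def output_format(self, value: datetime) -> Any:
        return value.astimezone(timezone.utc).isoformat(timespec="milliseconds")


print(MyDatetimeConverterFragment().output_format(datetime(2024, 1, 1, tzinfo=timezone.utc)))
# 2024-01-01T00:00:00.000+00:00
```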
airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py

```diff
@@ -82,7 +82,11 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
         # The start and end are the same to avoid confusion as to whether the records for this slice
         # were actually synced
         slices = [
-            {self.START_KEY: start if start is not None else sync_start, self.END_KEY: sync_start, self.MOST_RECENT_RECORD_KEY: sync_start}
+            {
+                self.START_KEY: start if start is not None else sync_start,
+                self.END_KEY: sync_start,
+                self.MOST_RECENT_RECORD_KEY: sync_start,
+            }
         ]
 
         return sync_start, {
```
airbyte_cdk/utils/slice_hasher.py (new file)

```diff
@@ -0,0 +1,30 @@
+import hashlib
+import json
+from typing import Any, Mapping, Optional, Final
+
+
+class SliceEncoder(json.JSONEncoder):
+    def default(self, obj: Any) -> Any:
+        if hasattr(obj, "__json_serializable__"):
+            return obj.__json_serializable__()
+
+        # Let the base class default method raise the TypeError
+        return super().default(obj)
+
+
+class SliceHasher:
+    _ENCODING: Final = "utf-8"
+
+    @classmethod
+    def hash(cls, stream_name: str, stream_slice: Optional[Mapping[str, Any]] = None) -> int:
+        if stream_slice:
+            try:
+                s = json.dumps(stream_slice, sort_keys=True, cls=SliceEncoder)
+                hash_input = f"{stream_name}:{s}".encode(cls._ENCODING)
+            except TypeError as e:
+                raise ValueError(f"Failed to serialize stream slice: {e}")
+        else:
+            hash_input = stream_name.encode(cls._ENCODING)
+
+        # Use last 8 bytes as 64-bit integer for better distribution
+        return int.from_bytes(hashlib.sha256(hash_input).digest()[-8:], "big")
```
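`SliceEncoder` lets slice values that are not natively JSON-serializable participate in hashing via a `__json_serializable__` hook. A sketch with a hypothetical `DateWindow` type:

```python
import json
from datetime import date

from airbyte_cdk.utils.slice_hasher import SliceEncoder, SliceHasher


class DateWindow:
    """Hypothetical slice value serialized through the __json_serializable__ hook."""

    def __init__(self, start: date, end: date):
        self.start, self.end = start, end

    def __json_serializable__(self):
        return [self.start.isoformat(), self.end.isoformat()]


window = DateWindow(date(2024, 1, 1), date(2024, 1, 2))
print(json.dumps({"window": window}, cls=SliceEncoder))  # {"window": ["2024-01-01", "2024-01-02"]}
print(SliceHasher.hash("users", {"window": window}))     # stable 64-bit int

# A value without the hook raises TypeError inside SliceEncoder, which
# SliceHasher.hash surfaces as ValueError.
```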
|