airbyte-cdk 6.18.0.dev3__py3-none-any.whl → 6.18.2__py3-none-any.whl

This diff shows the contents of two publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (21)
  1. airbyte_cdk/sources/declarative/auth/oauth.py +26 -0
  2. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +52 -36
  3. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +0 -43
  4. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +44 -20
  5. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +50 -58
  6. airbyte_cdk/sources/declarative/requesters/README.md +57 -0
  7. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +33 -4
  8. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +11 -0
  9. airbyte_cdk/sources/file_based/exceptions.py +34 -0
  10. airbyte_cdk/sources/file_based/file_based_source.py +28 -5
  11. airbyte_cdk/sources/file_based/file_based_stream_reader.py +18 -4
  12. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +25 -2
  13. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +30 -2
  14. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +20 -4
  15. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +34 -4
  16. airbyte_cdk/sources/types.py +3 -0
  17. {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/METADATA +1 -1
  18. {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/RECORD +21 -20
  19. {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/LICENSE.txt +0 -0
  20. {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/WHEEL +0 -0
  21. {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py

@@ -72,8 +72,6 @@ from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
     CsvParser,
     GzipParser,
     JsonLineParser,
-    JsonParser,
-    Parser,
 )
 from airbyte_cdk.sources.declarative.extractors import (
     DpathExtractor,
@@ -249,9 +247,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JsonLineParser as JsonLineParserModel,
 )
-from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    JsonParser as JsonParserModel,
-)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JwtAuthenticator as JwtAuthenticatorModel,
 )
@@ -527,7 +522,6 @@ class ModelToComponentFactory:
             JsonDecoderModel: self.create_json_decoder,
             JsonlDecoderModel: self.create_jsonl_decoder,
             JsonLineParserModel: self.create_json_line_parser,
-            JsonParserModel: self.create_json_parser,
             GzipJsonDecoderModel: self.create_gzipjson_decoder,
             GzipParserModel: self.create_gzip_parser,
             KeysToLowerModel: self.create_keys_to_lower_transformation,
@@ -1038,17 +1032,17 @@ class ModelToComponentFactory:
         self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any
     ) -> CursorPaginationStrategy:
         if isinstance(decoder, PaginationDecoderDecorator):
-            inner_decoder = decoder.decoder
-        else:
-            inner_decoder = decoder
-            decoder = PaginationDecoderDecorator(decoder=decoder)
-
-        if self._is_supported_decoder_for_pagination(inner_decoder):
+            if not isinstance(decoder.decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder.decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
             decoder_to_use = decoder
         else:
-            raise ValueError(
-                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
-            )
+            if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
+            decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
 
         return CursorPaginationStrategy(
             cursor_value=model.cursor_value,
@@ -1521,10 +1515,11 @@ class ModelToComponentFactory:
         cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None,
     ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]:
         if decoder:
-            if self._is_supported_decoder_for_pagination(decoder):
-                decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
-            else:
-                raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder)))
+            if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
+            decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
         else:
             decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
         page_size_option = (
@@ -1753,11 +1748,6 @@ class ModelToComponentFactory:
     def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> JsonDecoder:
         return JsonDecoder(parameters={})
 
-    @staticmethod
-    def create_json_parser(model: JsonParserModel, config: Config, **kwargs: Any) -> JsonParser:
-        encoding = model.encoding if model.encoding else "utf-8"
-        return JsonParser(encoding=encoding)
-
     @staticmethod
     def create_jsonl_decoder(
         model: JsonlDecoderModel, config: Config, **kwargs: Any
@@ -1895,15 +1885,24 @@ class ModelToComponentFactory:
                 expires_in_name=InterpolatedString.create(
                     model.expires_in_name or "expires_in", parameters=model.parameters or {}
                 ).eval(config),
+                client_id_name=InterpolatedString.create(
+                    model.client_id_name or "client_id", parameters=model.parameters or {}
+                ).eval(config),
                 client_id=InterpolatedString.create(
                     model.client_id, parameters=model.parameters or {}
                 ).eval(config),
+                client_secret_name=InterpolatedString.create(
+                    model.client_secret_name or "client_secret", parameters=model.parameters or {}
+                ).eval(config),
                 client_secret=InterpolatedString.create(
                     model.client_secret, parameters=model.parameters or {}
                 ).eval(config),
                 access_token_config_path=model.refresh_token_updater.access_token_config_path,
                 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path,
                 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path,
+                grant_type_name=InterpolatedString.create(
+                    model.grant_type_name or "grant_type", parameters=model.parameters or {}
+                ).eval(config),
                 grant_type=InterpolatedString.create(
                     model.grant_type or "refresh_token", parameters=model.parameters or {}
                 ).eval(config),
@@ -1921,11 +1920,15 @@ class ModelToComponentFactory:
         return DeclarativeOauth2Authenticator(  # type: ignore
             access_token_name=model.access_token_name or "access_token",
             access_token_value=model.access_token_value,
+            client_id_name=model.client_id_name or "client_id",
             client_id=model.client_id,
+            client_secret_name=model.client_secret_name or "client_secret",
             client_secret=model.client_secret,
             expires_in_name=model.expires_in_name or "expires_in",
+            grant_type_name=model.grant_type_name or "grant_type",
             grant_type=model.grant_type or "refresh_token",
             refresh_request_body=model.refresh_request_body,
+            refresh_token_name=model.refresh_token_name or "refresh_token",
             refresh_token=model.refresh_token,
             scopes=model.scopes,
             token_expiry_date=model.token_expiry_date,
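The two OAuth hunks above thread the new `client_id_name`, `client_secret_name`, `grant_type_name`, and `refresh_token_name` options through the component factory. A minimal, hypothetical sketch of what these options control, assuming (as the field names and their defaults suggest) that they rename the keys of the token refresh payload; `build_refresh_request_body` is an illustrative helper, not a CDK API:

```python
# Illustrative only: configurable key names for the token refresh payload.
# The defaults ("client_id", "grant_type", ...) match the factory defaults above;
# everything else here is a hypothetical stand-in, not CDK code.
from typing import Any, Dict


def build_refresh_request_body(
    client_id: str,
    client_secret: str,
    refresh_token: str,
    client_id_name: str = "client_id",
    client_secret_name: str = "client_secret",
    refresh_token_name: str = "refresh_token",
    grant_type_name: str = "grant_type",
    grant_type: str = "refresh_token",
) -> Dict[str, Any]:
    """Assemble a token refresh payload using configurable key names."""
    return {
        grant_type_name: grant_type,
        client_id_name: client_id,
        client_secret_name: client_secret,
        refresh_token_name: refresh_token,
    }


# A provider that expects "app_id"/"app_secret" instead of the standard keys:
payload = build_refresh_request_body(
    client_id="my-id",
    client_secret="my-secret",
    refresh_token="my-refresh-token",
    client_id_name="app_id",
    client_secret_name="app_secret",
)
# {'grant_type': 'refresh_token', 'app_id': 'my-id', 'app_secret': 'my-secret',
#  'refresh_token': 'my-refresh-token'}
```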
@@ -1937,22 +1940,22 @@ class ModelToComponentFactory:
             message_repository=self._message_repository,
         )
 
+    @staticmethod
     def create_offset_increment(
-        self, model: OffsetIncrementModel, config: Config, decoder: Decoder, **kwargs: Any
+        model: OffsetIncrementModel, config: Config, decoder: Decoder, **kwargs: Any
     ) -> OffsetIncrement:
         if isinstance(decoder, PaginationDecoderDecorator):
-            inner_decoder = decoder.decoder
-        else:
-            inner_decoder = decoder
-            decoder = PaginationDecoderDecorator(decoder=decoder)
-
-        if self._is_supported_decoder_for_pagination(inner_decoder):
+            if not isinstance(decoder.decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder.decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
             decoder_to_use = decoder
         else:
-            raise ValueError(
-                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
-            )
-
+            if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
+            decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
         return OffsetIncrement(
             page_size=model.page_size,
             config=config,
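The three pagination-related factory methods above (`create_cursor_pagination`, `create_default_paginator`, `create_offset_increment`) now repeat the same inline check in place of the removed `_is_supported_decoder_for_pagination` helper. A minimal sketch of that check, assuming `Decoder`, `JsonDecoder`, and `XmlDecoder` are importable from `airbyte_cdk.sources.declarative.decoders`; the helper name is hypothetical:

```python
# Illustrative only: the decoder validation the factory methods above now inline.
from airbyte_cdk.sources.declarative.decoders import Decoder, JsonDecoder, XmlDecoder


def require_supported_pagination_decoder(decoder: Decoder) -> None:
    """Raise unless the decoder is one of the two types accepted for pagination."""
    if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
        raise ValueError(
            f"Provided decoder of {type(decoder)=} is not supported. "
            "Please set JsonDecoder or XmlDecoder instead."
        )


require_supported_pagination_decoder(JsonDecoder(parameters={}))  # passes silently
```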
@@ -2297,7 +2300,7 @@ class ModelToComponentFactory:
             extractor=download_extractor,
             name=name,
             record_filter=None,
-            transformations=[],
+            transformations=transformations,
             schema_normalization=TypeTransformer(TransformConfig.NoTransform),
             config=config,
             parameters={},
@@ -2334,6 +2337,16 @@ class ModelToComponentFactory:
             if model.delete_requester
             else None
         )
+        url_requester = (
+            self._create_component_from_model(
+                model=model.url_requester,
+                decoder=decoder,
+                config=config,
+                name=f"job extract_url - {name}",
+            )
+            if model.url_requester
+            else None
+        )
         status_extractor = self._create_component_from_model(
             model=model.status_extractor, decoder=decoder, config=config, name=name
         )
@@ -2344,6 +2357,7 @@ class ModelToComponentFactory:
             creation_requester=creation_requester,
             polling_requester=polling_requester,
             download_retriever=download_retriever,
+            url_requester=url_requester,
             abort_requester=abort_requester,
             delete_requester=delete_requester,
             status_extractor=status_extractor,
@@ -2541,25 +2555,3 @@ class ModelToComponentFactory:
             components_mapping=components_mapping,
             parameters=model.parameters or {},
         )
-
-    _UNSUPPORTED_DECODER_ERROR = (
-        "Specified decoder of {decoder_type} is not supported for pagination."
-        "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead."
-        "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`."
-    )
-
-    def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool:
-        if isinstance(decoder, (JsonDecoder, XmlDecoder)):
-            return True
-        elif isinstance(decoder, CompositeRawDecoder):
-            return self._is_supported_parser_for_pagination(decoder.parser)
-        else:
-            return False
-
-    def _is_supported_parser_for_pagination(self, parser: Parser) -> bool:
-        if isinstance(parser, JsonParser):
-            return True
-        elif isinstance(parser, GzipParser):
-            return isinstance(parser.inner_parser, JsonParser)
-        else:
-            return False
airbyte_cdk/sources/declarative/requesters/README.md (new file)

@@ -0,0 +1,57 @@
+# AsyncHttpJobRepository sequence diagram
+
+- Components marked as optional are not required and can be ignored.
+- if `url_requester` is not provided, `urls_extractor` will get urls from the `polling_job_response`
+- interpolation_context, e.g. `create_job_response` or `polling_job_response` can be obtained from stream_slice
+
+
+```mermaid
+---
+title: AsyncHttpJobRepository Sequence Diagram
+---
+sequenceDiagram
+    participant AsyncHttpJobRepository as AsyncOrchestrator
+    participant CreationRequester as creation_requester
+    participant PollingRequester as polling_requester
+    participant UrlRequester as url_requester (Optional)
+    participant DownloadRetriever as download_retriever
+    participant AbortRequester as abort_requester (Optional)
+    participant DeleteRequester as delete_requester (Optional)
+    participant Reporting Server as Async Reporting Server
+
+    AsyncHttpJobRepository ->> CreationRequester: Initiate job creation
+    CreationRequester ->> Reporting Server: Create job request
+    Reporting Server -->> CreationRequester: Job ID response
+    CreationRequester -->> AsyncHttpJobRepository: Job ID
+
+    loop Poll for job status
+        AsyncHttpJobRepository ->> PollingRequester: Check job status
+        PollingRequester ->> Reporting Server: Status request (interpolation_context: `create_job_response`)
+        Reporting Server -->> PollingRequester: Status response
+        PollingRequester -->> AsyncHttpJobRepository: Job status
+    end
+
+    alt Status: Ready
+        AsyncHttpJobRepository ->> UrlRequester: Request download URLs (if applicable)
+        UrlRequester ->> Reporting Server: URL request (interpolation_context: `polling_job_response`)
+        Reporting Server -->> UrlRequester: Download URLs
+        UrlRequester -->> AsyncHttpJobRepository: Download URLs
+
+        AsyncHttpJobRepository ->> DownloadRetriever: Download reports
+        DownloadRetriever ->> Reporting Server: Retrieve report data (interpolation_context: `url`)
+        Reporting Server -->> DownloadRetriever: Report data
+        DownloadRetriever -->> AsyncHttpJobRepository: Report data
+    else Status: Failed
+        AsyncHttpJobRepository ->> AbortRequester: Send abort request
+        AbortRequester ->> Reporting Server: Abort job
+        Reporting Server -->> AbortRequester: Abort confirmation
+        AbortRequester -->> AsyncHttpJobRepository: Confirmation
+    end
+
+    AsyncHttpJobRepository ->> DeleteRequester: Send delete job request
+    DeleteRequester ->> Reporting Server: Delete job
+    Reporting Server -->> DeleteRequester: Deletion confirmation
+    DeleteRequester -->> AsyncHttpJobRepository: Confirmation
+
+
+```
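As a rough, non-CDK illustration of the flow in the diagram above: create a job, poll until it is no longer running, then resolve download URLs either through the optional `url_requester` or directly from the polling response, and finally download. Every callable here is a hypothetical stand-in for the requester components, not a CDK API:

```python
# Illustrative only: the orchestration order from the sequence diagram above.
from typing import Callable, Iterable, Optional


def run_async_job(
    create_job: Callable[[], str],
    poll_status: Callable[[str], str],
    extract_urls_from_polling_response: Callable[[str], Iterable[str]],
    download: Callable[[str], Iterable[dict]],
    request_download_urls: Optional[Callable[[str], Iterable[str]]] = None,
) -> Iterable[dict]:
    job_id = create_job()
    status = poll_status(job_id)
    while status == "running":
        status = poll_status(job_id)  # real code would sleep/backoff between polls
    if status != "completed":
        raise RuntimeError(f"job {job_id} finished with status {status!r}")
    # Optional url_requester step; otherwise URLs come from the polling response.
    urls = (
        request_download_urls(job_id)
        if request_download_urls
        else extract_urls_from_polling_response(job_id)
    )
    for url in urls:
        yield from download(url)
```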
airbyte_cdk/sources/declarative/requesters/http_job_repository.py

@@ -31,6 +31,10 @@ LOGGER = logging.getLogger("airbyte")
 
 @dataclass
 class AsyncHttpJobRepository(AsyncJobRepository):
+    """
+    See Readme file for more details about flow.
+    """
+
     creation_requester: Requester
     polling_requester: Requester
     download_retriever: SimpleRetriever
@@ -44,6 +48,9 @@ class AsyncHttpJobRepository(AsyncJobRepository):
     record_extractor: RecordExtractor = field(
         init=False, repr=False, default_factory=lambda: ResponseToFileExtractor({})
     )
+    url_requester: Optional[Requester] = (
+        None  # use it in case polling_requester provides some <id> and extra request is needed to obtain list of urls to download from
+    )
 
     def __post_init__(self) -> None:
         self._create_job_response_by_id: Dict[str, Response] = {}
@@ -186,10 +193,13 @@ class AsyncHttpJobRepository(AsyncJobRepository):
 
         """
 
-        for url in self.urls_extractor.extract_records(
-            self._polling_job_response_by_id[job.api_job_id()]
-        ):
-            stream_slice: StreamSlice = StreamSlice(partition={"url": url}, cursor_slice={})
+        for url in self._get_download_url(job):
+            job_slice = job.job_parameters()
+            stream_slice = StreamSlice(
+                partition=job_slice.partition,
+                cursor_slice=job_slice.cursor_slice,
+                extra_fields={**job_slice.extra_fields, "url": url},
+            )
             for message in self.download_retriever.read_records({}, stream_slice):
                 if isinstance(message, Record):
                     yield message.data
@@ -226,3 +236,22 @@ class AsyncHttpJobRepository(AsyncJobRepository):
             cursor_slice={},
         )
         return stream_slice
+
+    def _get_download_url(self, job: AsyncJob) -> Iterable[str]:
+        if not self.url_requester:
+            url_response = self._polling_job_response_by_id[job.api_job_id()]
+        else:
+            stream_slice: StreamSlice = StreamSlice(
+                partition={
+                    "polling_job_response": self._polling_job_response_by_id[job.api_job_id()]
+                },
+                cursor_slice={},
+            )
+            url_response = self.url_requester.send_request(stream_slice=stream_slice)  # type: ignore # we expect url_requester to always be presented, otherwise raise an exception as we cannot proceed with the report
+            if not url_response:
+                raise AirbyteTracedException(
+                    internal_message="Always expect a response or an exception from url_requester",
+                    failure_type=FailureType.system_error,
+                )
+
+        yield from self.urls_extractor.extract_records(url_response)  # type: ignore # we expect urls_extractor to always return list of strings
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py

@@ -31,6 +31,17 @@ class DeliverRawFiles(BaseModel):
 
     delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)
 
+    preserve_directory_structure: bool = Field(
+        title="Preserve Sub-Directories in File Paths",
+        description=(
+            "If enabled, sends subdirectory folder structure "
+            "along with source file names to the destination. "
+            "Otherwise, files will be synced by their names only. "
+            "This option is ignored when file-based replication is not enabled."
+        ),
+        default=True,
+    )
+
 
 class AbstractFileBasedSpec(BaseModel):
     """
airbyte_cdk/sources/file_based/exceptions.py

@@ -111,6 +111,40 @@ class ErrorListingFiles(BaseFileBasedSourceError):
     pass
 
 
+class DuplicatedFilesError(BaseFileBasedSourceError):
+    def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
+        self._duplicated_files_names = duplicated_files_names
+        self._stream_name: str = kwargs["stream"]
+        super().__init__(self._format_duplicate_files_error_message(), **kwargs)
+
+    def _format_duplicate_files_error_message(self) -> str:
+        duplicated_files_messages = []
+        for duplicated_file in self._duplicated_files_names:
+            for duplicated_file_name, file_paths in duplicated_file.items():
+                file_duplicated_message = (
+                    f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
+                    + "".join(f"\n - {file_paths}")
+                )
+                duplicated_files_messages.append(file_duplicated_message)
+
+        error_message = (
+            f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
+            "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
+            "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
+            + "\n".join(duplicated_files_messages)
+        )
+
+        return error_message
+
+    def __repr__(self) -> str:
+        """Return a string representation of the exception."""
+        class_name = self.__class__.__name__
+        properties_str = ", ".join(
+            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
+        )
+        return f"{class_name}({properties_str})"
+
+
 class CustomFileBasedException(AirbyteTracedException):
     """
     A specialized exception for file-based connectors.
airbyte_cdk/sources/file_based/file_based_source.py

@@ -242,7 +242,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     stream=self._make_default_stream(
                         stream_config=stream_config,
                         cursor=cursor,
-                        use_file_transfer=self._use_file_transfer(parsed_config),
+                        parsed_config=parsed_config,
                     ),
                     source=self,
                     logger=self.logger,
@@ -273,7 +273,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     stream=self._make_default_stream(
                         stream_config=stream_config,
                         cursor=cursor,
-                        use_file_transfer=self._use_file_transfer(parsed_config),
+                        parsed_config=parsed_config,
                     ),
                     source=self,
                     logger=self.logger,
@@ -285,7 +285,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                 stream = self._make_default_stream(
                     stream_config=stream_config,
                     cursor=cursor,
-                    use_file_transfer=self._use_file_transfer(parsed_config),
+                    parsed_config=parsed_config,
                 )
 
                 streams.append(stream)
@@ -298,7 +298,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
         self,
         stream_config: FileBasedStreamConfig,
         cursor: Optional[AbstractFileBasedCursor],
-        use_file_transfer: bool = False,
+        parsed_config: AbstractFileBasedSpec,
     ) -> AbstractFileBasedStream:
         return DefaultFileBasedStream(
             config=stream_config,
@@ -310,7 +310,8 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             validation_policy=self._validate_and_get_validation_policy(stream_config),
             errors_collector=self.errors_collector,
             cursor=cursor,
-            use_file_transfer=use_file_transfer,
+            use_file_transfer=self._use_file_transfer(parsed_config),
+            preserve_directory_structure=self._preserve_directory_structure(parsed_config),
         )
 
     def _get_stream_from_catalog(
@@ -385,3 +386,25 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             and parsed_config.delivery_method.delivery_type == "use_file_transfer"
         )
         return use_file_transfer
+
+    @staticmethod
+    def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
+        """
+        Determines whether to preserve directory structure during file transfer.
+
+        When enabled, files maintain their subdirectory paths in the destination.
+        When disabled, files are flattened to the root of the destination.
+
+        Args:
+            parsed_config: The parsed configuration containing delivery method settings
+
+        Returns:
+            True if directory structure should be preserved (default), False otherwise
+        """
+        if (
+            FileBasedSource._use_file_transfer(parsed_config)
+            and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
+            and parsed_config.delivery_method.preserve_directory_structure is not None
+        ):
+            return parsed_config.delivery_method.preserve_directory_structure
+        return True
airbyte_cdk/sources/file_based/file_based_stream_reader.py

@@ -135,6 +135,17 @@ class AbstractFileBasedStreamReader(ABC):
             return use_file_transfer
         return False
 
+    def preserve_directory_structure(self) -> bool:
+        # fall back to preserve subdirectories if config is not present or incomplete
+        if (
+            self.use_file_transfer()
+            and self.config
+            and hasattr(self.config.delivery_method, "preserve_directory_structure")
+            and self.config.delivery_method.preserve_directory_structure is not None
+        ):
+            return self.config.delivery_method.preserve_directory_structure
+        return True
+
     @abstractmethod
     def get_file(
         self, file: RemoteFile, local_directory: str, logger: logging.Logger
@@ -159,10 +170,13 @@ class AbstractFileBasedStreamReader(ABC):
         """
         ...
 
-    @staticmethod
-    def _get_file_transfer_paths(file: RemoteFile, local_directory: str) -> List[str]:
-        # Remove left slashes from source path format to make relative path for writing locally
-        file_relative_path = file.uri.lstrip("/")
+    def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
+        preserve_directory_structure = self.preserve_directory_structure()
+        if preserve_directory_structure:
+            # Remove left slashes from source path format to make relative path for writing locally
+            file_relative_path = file.uri.lstrip("/")
+        else:
+            file_relative_path = path.basename(file.uri)
         local_file_path = path.join(local_directory, file_relative_path)
 
         # Ensure the local directory exists
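The `_get_file_transfer_paths` change above boils down to how the relative path is computed. A small standalone sketch of the two behaviors (the function name here is illustrative, not the CDK method):

```python
# Illustrative only: relative-path handling with and without
# preserve_directory_structure, mirroring the hunk above.
from os import path


def file_relative_path(uri: str, preserve_directory_structure: bool) -> str:
    if preserve_directory_structure:
        # Keep sub-directories; just strip the leading slash.
        return uri.lstrip("/")
    # Flatten: keep only the base file name.
    return path.basename(uri)


print(file_relative_path("/reports/2024/jan.csv", True))   # reports/2024/jan.csv
print(file_relative_path("/reports/2024/jan.csv", False))  # jan.csv
```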
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

@@ -2,6 +2,7 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 import logging
+import os
 import traceback
 from datetime import datetime
 from io import BytesIO, IOBase
@@ -42,12 +43,34 @@ unstructured_partition_pdf = None
 unstructured_partition_docx = None
 unstructured_partition_pptx = None
 
+AIRBYTE_NLTK_DATA_DIR = "/airbyte/nltk_data"
+TMP_NLTK_DATA_DIR = "/tmp/nltk_data"
+
+
+def get_nltk_temp_folder() -> str:
+    """
+    For non-root connectors /tmp is not currently writable, but we should allow it in the future.
+    It's safe to use /airbyte for now. Fallback to /tmp for local development.
+    """
+    try:
+        nltk_data_dir = AIRBYTE_NLTK_DATA_DIR
+        os.makedirs(nltk_data_dir, exist_ok=True)
+    except OSError:
+        nltk_data_dir = TMP_NLTK_DATA_DIR
+        os.makedirs(nltk_data_dir, exist_ok=True)
+    return nltk_data_dir
+
+
 try:
+    nltk_data_dir = get_nltk_temp_folder()
+    nltk.data.path.append(nltk_data_dir)
     nltk.data.find("tokenizers/punkt.zip")
     nltk.data.find("tokenizers/punkt_tab.zip")
+    nltk.data.find("tokenizers/averaged_perceptron_tagger_eng.zip")
 except LookupError:
-    nltk.download("punkt")
-    nltk.download("punkt_tab")
+    nltk.download("punkt", download_dir=nltk_data_dir, quiet=True)
+    nltk.download("punkt_tab", download_dir=nltk_data_dir, quiet=True)
+    nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir, quiet=True)
 
 
 def optional_decode(contents: Union[str, bytes]) -> str:
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py

@@ -5,14 +5,17 @@
 import asyncio
 import itertools
 import traceback
+from collections import defaultdict
 from copy import deepcopy
 from functools import cache
-from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union
+from os import path
+from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union
 
 from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
 from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
 from airbyte_cdk.sources.file_based.exceptions import (
+    DuplicatedFilesError,
     FileBasedSourceError,
     InvalidSchemaError,
     MissingSchemaError,
@@ -43,6 +46,8 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     """
 
     FILE_TRANSFER_KW = "use_file_transfer"
+    PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure"
+    FILES_KEY = "files"
     DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
     ab_last_mod_col = "_ab_source_file_last_modified"
     ab_file_name_col = "_ab_source_file_url"
@@ -50,10 +55,15 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     source_file_url = "source_file_url"
     airbyte_columns = [ab_last_mod_col, ab_file_name_col]
     use_file_transfer = False
+    preserve_directory_structure = True
 
     def __init__(self, **kwargs: Any):
         if self.FILE_TRANSFER_KW in kwargs:
             self.use_file_transfer = kwargs.pop(self.FILE_TRANSFER_KW, False)
+        if self.PRESERVE_DIRECTORY_STRUCTURE_KW in kwargs:
+            self.preserve_directory_structure = kwargs.pop(
+                self.PRESERVE_DIRECTORY_STRUCTURE_KW, True
+            )
         super().__init__(**kwargs)
 
     @property
@@ -98,15 +108,33 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         else:
             return super()._filter_schema_invalid_properties(configured_catalog_json_schema)
 
+    def _duplicated_files_names(
+        self, slices: List[dict[str, List[RemoteFile]]]
+    ) -> List[dict[str, List[str]]]:
+        seen_file_names: Dict[str, List[str]] = defaultdict(list)
+        for file_slice in slices:
+            for file_found in file_slice[self.FILES_KEY]:
+                file_name = path.basename(file_found.uri)
+                seen_file_names[file_name].append(file_found.uri)
+        return [
+            {file_name: paths} for file_name, paths in seen_file_names.items() if len(paths) > 1
+        ]
+
     def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
         # Sort files by last_modified, uri and return them grouped by last_modified
         all_files = self.list_files()
         files_to_read = self._cursor.get_files_to_sync(all_files, self.logger)
         sorted_files_to_read = sorted(files_to_read, key=lambda f: (f.last_modified, f.uri))
         slices = [
-            {"files": list(group[1])}
+            {self.FILES_KEY: list(group[1])}
             for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
         ]
+        if slices and not self.preserve_directory_structure:
+            duplicated_files_names = self._duplicated_files_names(slices)
+            if duplicated_files_names:
+                raise DuplicatedFilesError(
+                    stream=self.name, duplicated_files_names=duplicated_files_names
+                )
         return slices
 
     def transform_record(
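To make the duplicate check concrete: a standalone sketch of what `_duplicated_files_names` computes over the slices built by `compute_slices`, with plain URI strings standing in for `RemoteFile` objects:

```python
# Illustrative only: duplicate base-name detection over {"files": [...]} slices,
# with strings standing in for RemoteFile objects (the real code uses file.uri).
from collections import defaultdict
from os import path
from typing import Dict, List


def duplicated_files_names(slices: List[dict]) -> List[Dict[str, List[str]]]:
    seen: Dict[str, List[str]] = defaultdict(list)
    for file_slice in slices:
        for uri in file_slice["files"]:
            seen[path.basename(uri)].append(uri)
    return [{name: uris} for name, uris in seen.items() if len(uris) > 1]


slices = [{"files": ["/a/data.csv", "/b/data.csv", "/c/other.csv"]}]
print(duplicated_files_names(slices))
# [{'data.csv': ['/a/data.csv', '/b/data.csv']}]
```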