airbyte-cdk 0.53.2__py3-none-any.whl → 0.53.4__py3-none-any.whl

--- a/airbyte_cdk/destinations/vector_db_based/config.py
+++ b/airbyte_cdk/destinations/vector_db_based/config.py
@@ -4,6 +4,7 @@
 
 from typing import List, Literal, Optional, Union
 
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field
 
 
@@ -16,11 +17,10 @@ class SeparatorSplitterConfigModel(BaseModel):
     )
     keep_separator: bool = Field(default=False, title="Keep separator", description="Whether to keep the separator in the resulting chunks")
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "By Separator"
-        schema_extra = {
-            "description": "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc."
-        }
+        description = "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc."
+        discriminator = "mode"
 
 
 class MarkdownHeaderSplitterConfigModel(BaseModel):
@@ -33,11 +33,10 @@ class MarkdownHeaderSplitterConfigModel(BaseModel):
         ge=1,
     )
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "By Markdown header"
-        schema_extra = {
-            "description": "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk."
-        }
+        description = "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk."
+        discriminator = "mode"
 
 
 class CodeSplitterConfigModel(BaseModel):
@@ -65,11 +64,12 @@ class CodeSplitterConfigModel(BaseModel):
         ],
     )
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "By Programming Language"
-        schema_extra = {
-            "description": "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
-        }
+        description = (
+            "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
+        )
+        discriminator = "mode"
 
 
 TextSplitterConfigModel = Union[SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel]
@@ -128,11 +128,12 @@ class OpenAIEmbeddingConfigModel(BaseModel):
     mode: Literal["openai"] = Field("openai", const=True)
     openai_key: str = Field(..., title="OpenAI API key", airbyte_secret=True)
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "OpenAI"
-        schema_extra = {
-            "description": "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
-        }
+        description = (
+            "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
+        )
+        discriminator = "mode"
 
 
 class OpenAICompatibleEmbeddingConfigModel(BaseModel):
@@ -151,9 +152,10 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
         title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
     )
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "OpenAI-compatible"
-        schema_extra = {"description": "Use a service that's compatible with the OpenAI API to embed text."}
+        description = "Use a service that's compatible with the OpenAI API to embed text."
+        discriminator = "mode"
 
 
 class AzureOpenAIEmbeddingConfigModel(BaseModel):
@@ -177,21 +179,19 @@ class AzureOpenAIEmbeddingConfigModel(BaseModel):
         examples=["your-resource-name"],
     )
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Azure OpenAI"
-        schema_extra = {
-            "description": "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
-        }
+        description = "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
+        discriminator = "mode"
 
 
 class FakeEmbeddingConfigModel(BaseModel):
     mode: Literal["fake"] = Field("fake", const=True)
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Fake"
-        schema_extra = {
-            "description": "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs."
-        }
+        description = "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs."
+        discriminator = "mode"
 
 
 class FromFieldEmbeddingConfigModel(BaseModel):
@@ -203,17 +203,17 @@ class FromFieldEmbeddingConfigModel(BaseModel):
         ..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
     )
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "From Field"
-        schema_extra = {
-            "description": "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
-        }
+        description = "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
+        discriminator = "mode"
 
 
 class CohereEmbeddingConfigModel(BaseModel):
     mode: Literal["cohere"] = Field("cohere", const=True)
     cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True)
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Cohere"
-        schema_extra = {"description": "Use the Cohere API to embed text."}
+        description = "Use the Cohere API to embed text."
+        discriminator = "mode"

--- a/airbyte_cdk/sources/file_based/config/avro_format.py
+++ b/airbyte_cdk/sources/file_based/config/avro_format.py
@@ -2,12 +2,14 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field
 
 
 class AvroFormat(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Avro Format"
+        discriminator = "filetype"
 
     filetype: str = Field(
         "avro",

--- a/airbyte_cdk/sources/file_based/config/csv_format.py
+++ b/airbyte_cdk/sources/file_based/config/csv_format.py
@@ -6,6 +6,7 @@ import codecs
 from enum import Enum
 from typing import Any, Dict, List, Optional, Set, Union
 
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field, ValidationError, root_validator, validator
 
 
@@ -21,8 +22,9 @@ class CsvHeaderDefinitionType(Enum):
 
 
 class CsvHeaderFromCsv(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "From CSV"
+        discriminator = "header_definition_type"
 
     header_definition_type: str = Field(
         CsvHeaderDefinitionType.FROM_CSV.value,
@@ -34,8 +36,9 @@ class CsvHeaderFromCsv(BaseModel):
 
 
 class CsvHeaderAutogenerated(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Autogenerated"
+        discriminator = "header_definition_type"
 
     header_definition_type: str = Field(
         CsvHeaderDefinitionType.AUTOGENERATED.value,
@@ -47,8 +50,9 @@ class CsvHeaderAutogenerated(BaseModel):
 
 
 class CsvHeaderUserProvided(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "User Provided"
+        discriminator = "header_definition_type"
 
     header_definition_type: str = Field(
         CsvHeaderDefinitionType.USER_PROVIDED.value,
@@ -74,8 +78,9 @@ DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
 
 
 class CsvFormat(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "CSV Format"
+        discriminator = "filetype"
 
     filetype: str = Field(
         "csv",
@@ -123,7 +128,7 @@ class CsvFormat(BaseModel):
     )
     header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
         title="CSV Header Definition",
-        default=CsvHeaderFromCsv(),
+        default=CsvHeaderFromCsv(header_definition_type=CsvHeaderDefinitionType.FROM_CSV.value),
         description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
     )
     true_values: Set[str] = Field(

--- a/airbyte_cdk/sources/file_based/config/jsonl_format.py
+++ b/airbyte_cdk/sources/file_based/config/jsonl_format.py
@@ -2,12 +2,14 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field
 
 
 class JsonlFormat(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Jsonl Format"
+        discriminator = "filetype"
 
     filetype: str = Field(
         "jsonl",

--- a/airbyte_cdk/sources/file_based/config/parquet_format.py
+++ b/airbyte_cdk/sources/file_based/config/parquet_format.py
@@ -2,12 +2,14 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field
 
 
 class ParquetFormat(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Parquet Format"
+        discriminator = "filetype"
 
     filetype: str = Field(
         "parquet",

--- a/airbyte_cdk/sources/file_based/config/unstructured_format.py
+++ b/airbyte_cdk/sources/file_based/config/unstructured_format.py
@@ -2,15 +2,26 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+from typing import Optional
+
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field
 
 
 class UnstructuredFormat(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Document File Type Format (Experimental)"
-        schema_extra = {"description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."}
+        description = "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."
+        discriminator = "filetype"
 
     filetype: str = Field(
         "unstructured",
         const=True,
     )
+
+    skip_unprocessable_file_types: Optional[bool] = Field(
+        default=True,
+        title="Skip Unprocessable File Types",
+        description="If true, skip files that cannot be parsed because of their file type and log a warning. If false, fail the sync. Corrupted files with valid file types will still result in a failed sync.",
+        always_show=True,
+    )

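The new `skip_unprocessable_file_types` field is surfaced through the stream's `format` block. A minimal sketch of a source config that enables it; the shape mirrors the test scenarios further down in this diff, and the stream name and glob are placeholders:

```python
# Minimal sketch of a file-based source config exercising the new option.
# The shape mirrors the unstructured_scenarios test configs below.
config = {
    "streams": [
        {
            "name": "stream1",
            "format": {"filetype": "unstructured", "skip_unprocessable_file_types": True},
            "globs": ["*"],
            "validation_policy": "Emit Record",
        }
    ]
}
```
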
--- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
+++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
@@ -6,6 +6,7 @@ from io import BytesIO, IOBase
 from typing import Any, Dict, Iterable, List, Mapping, Optional
 
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
+from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
@@ -60,11 +61,12 @@ class UnstructuredParser(FileTypeParser):
         stream_reader: AbstractFileBasedStreamReader,
         logger: logging.Logger,
     ) -> SchemaType:
+        format = _extract_format(config)
         with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
             filetype = self._get_filetype(file_handle, file)
 
             if filetype not in self._supported_file_types():
-                raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri)
+                self._handle_unprocessable_file(file, format, logger)
 
             return {
                 "content": {"type": "string"},
@@ -79,14 +81,16 @@ class UnstructuredParser(FileTypeParser):
         logger: logging.Logger,
         discovered_schema: Optional[Mapping[str, SchemaType]],
     ) -> Iterable[Dict[str, Any]]:
+        format = _extract_format(config)
         with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
-            markdown = self._read_file(file_handle, file)
-            yield {
-                "content": markdown,
-                "document_key": file.uri,
-            }
-
-    def _read_file(self, file_handle: IOBase, remote_file: RemoteFile) -> str:
+            markdown = self._read_file(file_handle, file, format, logger)
+            if markdown is not None:
+                yield {
+                    "content": markdown,
+                    "document_key": file.uri,
+                }
+
+    def _read_file(self, file_handle: IOBase, remote_file: RemoteFile, format: UnstructuredFormat, logger: logging.Logger) -> Optional[str]:
         _import_unstructured()
         if (
             (not unstructured_partition_pdf)
@@ -104,7 +108,8 @@ class UnstructuredParser(FileTypeParser):
             decoded_content: str = unstructured_optional_decode(file_content)
             return decoded_content
         if filetype not in self._supported_file_types():
-            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri)
+            self._handle_unprocessable_file(remote_file, format, logger)
+            return None
 
         file: Any = file_handle
         if filetype == FileType.PDF:
@@ -120,6 +125,12 @@ class UnstructuredParser(FileTypeParser):
 
         return self._render_markdown(elements)
 
+    def _handle_unprocessable_file(self, remote_file: RemoteFile, format: UnstructuredFormat, logger: logging.Logger) -> None:
+        if format.skip_unprocessable_file_types:
+            logger.warn(f"File {remote_file.uri} cannot be parsed. Skipping it.")
+        else:
+            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri)
+
     def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileType]:
         """
         Detect the file type based on the file name and the file content.
@@ -172,3 +183,10 @@ class UnstructuredParser(FileTypeParser):
     @property
     def file_read_mode(self) -> FileReadMode:
         return FileReadMode.READ_BINARY
+
+
+def _extract_format(config: FileBasedStreamConfig) -> UnstructuredFormat:
+    config_format = config.format
+    if not isinstance(config_format, UnstructuredFormat):
+        raise ValueError(f"Invalid format config: {config_format}")
+    return config_format

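Both `infer_schema` and `parse_records` now funnel unsupported file types through `_handle_unprocessable_file`, so the skip-or-fail decision lives in one place. A hedged sketch of the two behaviors, calling the (private) helper directly with mocked collaborators:

```python
# Hedged sketch of the new behavior; the file and logger are stand-ins.
from datetime import datetime
from unittest.mock import MagicMock

from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
from airbyte_cdk.sources.file_based.exceptions import RecordParseError
from airbyte_cdk.sources.file_based.file_types import UnstructuredParser
from airbyte_cdk.sources.file_based.remote_file import RemoteFile

parser = UnstructuredParser()
file = RemoteFile(uri="path/to/file.xyz", last_modified=datetime.now())
logger = MagicMock()

# Skipping enabled: the helper only logs a warning and returns, so
# parse_records ends up yielding nothing for the unsupported file.
parser._handle_unprocessable_file(file, UnstructuredFormat(skip_unprocessable_file_types=True), logger)

# Skipping disabled: the helper raises, failing the sync as before.
try:
    parser._handle_unprocessable_file(file, UnstructuredFormat(skip_unprocessable_file_types=False), logger)
except RecordParseError:
    pass
```
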
--- a/airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py
+++ b/airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py
@@ -154,7 +154,7 @@ class ThreadBasedConcurrentStream(AbstractStream):
         if len(futures) < self._max_concurrent_tasks:
             return
 
-        for index in range(len(futures)):
+        for index in reversed(range(len(futures))):
             future = futures[index]
             optional_exception = future.exception()
             if optional_exception:

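Context for the one-line change above: `_wait_while_too_many_pending_futures` prunes completed futures from the list it is iterating over (the new unit test below asserts the list is mutated), and deleting by index during a forward scan shifts later elements down, so some are skipped or the stale `range` runs past the shortened list. Iterating the indices in reverse keeps every not-yet-visited index stable. A self-contained sketch of the pattern; the pruning body here is a stand-in, not the stream's actual code:

```python
# Sketch of why the loop runs in reverse; "done" stands in for future.done()
# and the deletion for the method's actual pruning of completed futures.
from typing import List

def prune_done(futures: List[str]) -> None:
    for index in reversed(range(len(futures))):
        if futures[index] == "done":
            del futures[index]  # safe: only indices below `index` shift

futures = ["done", "done", "pending"]
prune_done(futures)
assert futures == ["pending"]
# A forward `for index in range(len(futures))` would skip the element that
# slides into a freed slot and can index past the end of the shortened list.
```
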
--- /dev/null
+++ b/airbyte_cdk/utils/oneof_option_config.py
@@ -0,0 +1,33 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+from typing import Any, Dict
+
+
+class OneOfOptionConfig:
+    """
+    Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.
+
+    Inherit from this class in the nested Config class in a model and set title and description (these show up in the UI) and discriminator (this is making sure it's marked as required in the schema).
+
+    Usage:
+
+    ```python
+    class OptionModel(BaseModel):
+        mode: Literal["option_a"] = Field("option_a", const=True)
+        option_a_field: str = Field(...)
+
+        class Config(OneOfOptionConfig):
+            title = "Option A"
+            description = "Option A description"
+            discriminator = "mode"
+    ```
+    """
+
+    @staticmethod
+    def schema_extra(schema: Dict[str, Any], model: Any) -> None:
+        if hasattr(model.Config, "description"):
+            schema["description"] = model.Config.description
+        if hasattr(model.Config, "discriminator"):
+            schema.setdefault("required", []).append(model.Config.discriminator)

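In pydantic v1, a callable `Config.schema_extra` is invoked as a post-processing hook with the generated schema and the model, which is what makes this base class work: inheriting it injects the description and appends the discriminator to `required` even though the discriminator field has a default. A quick check of the docstring's own example, assuming pydantic v1 semantics:

```python
# Assumes pydantic v1, where a callable Config.schema_extra receives
# (schema, model) and may mutate the schema in place.
from typing import Literal

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field

class OptionModel(BaseModel):
    mode: Literal["option_a"] = Field("option_a", const=True)
    option_a_field: str = Field(...)

    class Config(OneOfOptionConfig):
        title = "Option A"
        description = "Option A description"
        discriminator = "mode"

schema = OptionModel.schema()
assert schema["title"] == "Option A"
assert schema["description"] == "Option A description"
# "mode" has a default, so pydantic alone would not list it as required;
# the schema_extra hook appends it.
assert "mode" in schema["required"]
```
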
--- a/airbyte_cdk-0.53.2.dist-info/METADATA
+++ b/airbyte_cdk-0.53.4.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.53.2
+Version: 0.53.4
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte

--- a/airbyte_cdk-0.53.2.dist-info/RECORD
+++ b/airbyte_cdk-0.53.4.dist-info/RECORD
@@ -13,7 +13,7 @@ airbyte_cdk/connector_builder/models.py,sha256=U2LrL1syxZ0gQ3LgnwVj9ozL6uGH5f9bi
 airbyte_cdk/destinations/__init__.py,sha256=0Uxmz3iBAyZJdk_bqUVt2pb0UwRTpFjTnFE6fQFbWKY,126
 airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQwQj8MyVm3BI,5420
 airbyte_cdk/destinations/vector_db_based/__init__.py,sha256=eAkzwTjBbXBhJ5GfPO5I53Zgpv5xQFLRQS8n4nuyPt0,1006
-airbyte_cdk/destinations/vector_db_based/config.py,sha256=tMp8blgdrI4t7a9Ri9Vydk0TOcRqLTHHUjVlXtc0Wa4,9562
+airbyte_cdk/destinations/vector_db_based/config.py,sha256=FrbW0RVzTrdMotgj7uPjUsEiD8Ij9_Z6FA3OXdqZv3Y,9812
 airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=ldrlmCT4gFHc_A5B_um4OteXg1OR0LGyDmswO1316tA,8649
 airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=davAE4UtrpWDjbV74tck5zvKksxizvSdF9X51WFMbW4,10913
 airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=58Uf34yIe0QHbnpbkS7rH2sqL7eLzwWUjx7X4yciyeA,3165
@@ -156,12 +156,12 @@ airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availab
 airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=DFUJZzUfl6lBbIEVk-BXFh-yGxXle0anM7eM2NsnCeQ,5019
 airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=WrV4sKtJoZ1dK31HK7NdBKlnYHkmu6NqjmEpkVqJ6tQ,4582
-airbyte_cdk/sources/file_based/config/avro_format.py,sha256=oLJIuNInu-MgjkVFqwHvmQ4CPZa4NZingq_I0_trQ3g,589
-airbyte_cdk/sources/file_based/config/csv_format.py,sha256=xlBZ5WyAshagjjjbUV_je1JyZ1oY1GbIzJRUZ9UfSvo,7095
+airbyte_cdk/sources/file_based/config/avro_format.py,sha256=lQSEq5JZY0M5y9mW93R4EjrIb8brYXUgrXCY-6EMHww,711
+airbyte_cdk/sources/file_based/config/csv_format.py,sha256=L3JEgb91yrCob1oYrGl0088QEWblkOsRfDmMfWRQ0bg,7482
 airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=5x2BQVV_ZZcV5727gIypnfoIiI21X_dnkkjCAkQy3ZI,3967
-airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=usmTeTw8xw8OKwrz8MsiS5E1LQiVEbedGHMHNAfOOlk,252
-airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=O_Eq0yVzjPiKDz8H1-f9yMowtCcJwT9F2prNYpXZkp0,614
-airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=iiEGIPspyDcGY36cagqNV3CazEJdZoTrSZwpJZb_laE,430
+airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=fAPzZnoghGgHjaDvx6Qo68C8j54mBxo1NTdpwSI0VZo,374
+airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=8GTDTQyvS7pWLVG0LWirHVE1snHd0Au5R4Ym33-ezEg,736
+airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=8yc0TMhlf1bcJc34IXzYkYHQ5HpGN4rt1f3zKSiCeYk,934
 airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=x_7JsQGiS7Ytmr0ZDS0SNYGcNUzC4wCm3_1-Mf3ZFnw,283
 airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=0o_qmEO0-IojO4Ckgp4V3ackTM9Ui1sUHW5HwANueLM,621
 airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=QeZghVmf2Cq4wy_6NYcHmR6SLgdWfsGgctYg2ZsjFE4,939
@@ -171,7 +171,7 @@ airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=VVV829XszmDRqmgv6
 airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=b987gENSP649ijRd_33ZVJVNIlFMr-F-FkG333NkNFc,2235
 airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
 airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=Hz_3GqCPKmTuHJgMHY_afD3Ul6YsF28lEPeJSnpvNc4,8776
-airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=OMx1Xpp_xgtcTehtpsz9GCvr86-fgdEmS4ev0VOgNZE,7213
+airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=DcRoivT3mwJY8aJjTIzDADKwMR0s6mp2RuCsj8l8Bko,8115
 airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
 airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
 airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
@@ -199,7 +199,7 @@ airbyte_cdk/sources/streams/concurrent/exceptions.py,sha256=-WETGIY5_QFmVeDFiqm4
 airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py,sha256=uAUhCkxFOaptDJfIEDmFnnF4xn4coG5kvE4B_5tVx14,1557
 airbyte_cdk/sources/streams/concurrent/partition_reader.py,sha256=H8sGVVGx6uKMSUehRaqmVbE19DE3cx3NivQ4sFj8wbk,1303
 airbyte_cdk/sources/streams/concurrent/state_converter.py,sha256=PwqcRVPR6LQxWL0yvPTp_u2Uh0hBJU-BDSjPKiyJVEk,4689
-airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py,sha256=ZfjrqY-I43H-qQHmwJnHnP_4snvPBFwD9dIVwV1gOqU,10833
+airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py,sha256=M7CpPPBswHTYjG4opiTOf5eWHOJ6i4TyP0v991pFxOo,10843
 airbyte_cdk/sources/streams/concurrent/partitions/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
 airbyte_cdk/sources/streams/concurrent/partitions/partition.py,sha256=tjXF8lZMvyfZaCYCHr5aTPwbVstmRjYZDwYAvLDY-ds,1312
 airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py,sha256=_ymkkBr71_qt1fW0_MUqw96OfNBkeJngXQ09yolEDHw,441
@@ -238,6 +238,7 @@ airbyte_cdk/utils/datetime_format_inferrer.py,sha256=gGKDQ3OdY18R5CVFhq4c7zB_E4C
 airbyte_cdk/utils/event_timing.py,sha256=Hn5kCc9xGKLcV5EYpJCZwNiz9neKKu2WG8FJF_hy278,2377
 airbyte_cdk/utils/is_cloud_environment.py,sha256=KAR_Ak_aD2X6a2zEtRAg3kaEHiJtOifWE1uwqW2-yHE,566
 airbyte_cdk/utils/mapping_helpers.py,sha256=tVkbgnxy12Ah2Jxh_3tKW7CTKTAVIcPexsBhsiyTbp4,1729
+airbyte_cdk/utils/oneof_option_config.py,sha256=N8EmWdYdwt0FM7fuShh6H8nj_r4KEL9tb2DJJtwsPow,1180
 airbyte_cdk/utils/schema_inferrer.py,sha256=D8vFVgeK6VLcAug4YVAHfa3D29On0A_nMlwq9SPlfPI,3799
 airbyte_cdk/utils/spec_schema_transformations.py,sha256=LGjSSk8lmBiC0GiHqxDwu_iMN6bCe05UMpz9e7nCw5E,741
 airbyte_cdk/utils/stream_status_utils.py,sha256=X1Vy7BhglycjdIWpfKDfwJussNCxYffelKt6Utjx-qY,1005
@@ -338,7 +339,7 @@ unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slic
 unit_tests/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/helpers.py,sha256=MZTwaWtX0a6TPbFcUMP-EgqBunK2wpoElgApCEE1bN4,2659
 unit_tests/sources/file_based/in_memory_files_source.py,sha256=r2yD6-_ABXG7_PIyTq4ACN21sHyg3g-Hd9dIgxfDQUk,8235
-unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=dhWlq2qKuPBxCeVvFCSoySGXEbJCszunblWOjAnFpuw,11430
+unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=9xVFaFFHjnzZziVmoVmLTULdxANt_zSrwVgANAVytl4,11564
 unit_tests/sources/file_based/test_file_based_stream_reader.py,sha256=P6yTp7tbPfREzi5SXg4SSSql5nxiRV571YdOmwb_SzY,9219
 unit_tests/sources/file_based/test_scenarios.py,sha256=2-9pqnfva3RDRyODy0xcK6mxrP_mHH5vLrmBhqgZO8o,9703
 unit_tests/sources/file_based/test_schema_helpers.py,sha256=IYIDdLRK41RkSG_ZW2cagAt9krV4QLbkzu6r7vPx9Js,12047
@@ -355,17 +356,17 @@ unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=LCoGa0fvOber
 unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=zgHjLfPASRwFxkubdRK0UkskGTOAdASpWHKucm0AmqM,22423
 unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
 unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=J66wfbAaflSe5y3ixCZ4tLPEQdU62eYj-pNXycCtK0U,14159
-unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=I9yY11rrtdybKl5C-yh3qwKL1_aCnNwcg1xcENujDak,5670
+unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=W6jbuX2BBwHECuTS8NUlPgjFptrXGDQW4tJZUKwcfR0,7028
 unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=oeQUmCV7d2aTShreYc-PvVb4cWqLSsVwHfg-lcKjzPs,30554
 unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=0xkt21ASTnTAMP0RYJEsF3yMGsNN7wWOoG_tmzL9PYw,6750
-unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=M4Ol5y1WrUlNhSW2uyD4aUfoxeg2FrPKGHT5tfxXeBM,108612
+unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=ochAex6o44Ov7-KpTWnaYOZja_kzprBB1aM9eVQIHeg,109887
 unit_tests/sources/file_based/scenarios/file_based_source_builder.py,sha256=wgb7l5VohcEvZT82ZpJcjINSrjuJtzJZS4zuZjdKpJ4,3874
 unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=B7YE2IbvgTH_v7DYQEuv7yn2IG15aKUvJ_7dA4d3Cg4,69413
 unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=LsOf-tpjWNuwskPcgAMhMpQQ3iaHaD3PjPmt2M2zSzo,31839
 unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=MGgLCqkTJb8uNEwYZY3zbVVDZRSBKSmf2s8VMuYse_I,26549
 unit_tests/sources/file_based/scenarios/scenario_builder.py,sha256=feSSViayuoxTquoRhMUg4Lcui7dtwWHQ1Fe5y9igWSo,8728
-unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=BuAPJMP1Aur35lN24S3mEwj9zl6OYWs7aI4sdC_dGwo,62420
+unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=3jeDxyLh6LgwK0wMhU884fqSXG47H3AWvIQDD15jO6c,64973
 unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py,sha256=7CxIaqZxAGSPs4AtcKZ9FLVVYQPsS__uXi9wnQMKn3U,28322
 unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=Try0knJN5wfoGNO38QGoLGIcqSceSAQsUWO42CusNYI,33005
 unit_tests/sources/file_based/stream/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -386,7 +387,7 @@ unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py,sha
 unit_tests/sources/streams/concurrent/test_cursor.py,sha256=sqH8xE3GamETSSVqsdKafziAAm-y_j_MegES_C5ExMM,5790
 unit_tests/sources/streams/concurrent/test_partition_reader.py,sha256=eM5dzfmLKm9Lj-BfQUjAZRhCZzfvhk7AkKpcHGcoPfg,931
 unit_tests/sources/streams/concurrent/test_state_converter.py,sha256=rvg8becWR1iPdm5TAanZssKj5_iw8dInE_uqmjqghZE,8349
-unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py,sha256=HJYZCwgSyGmKdQSInZK1rUDbtW5RepdOa05hC10RPe4,10894
+unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py,sha256=_jBMJIZ6Hu9mWX4v9SRUdtxvgntA-rQpNbbygBi6HXA,11629
 unit_tests/sources/streams/concurrent/scenarios/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
 unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py,sha256=x77AQf8_O4dQ2aF1o800CzI0hOEyU8ayxoNdSOvxkhM,10495
 unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py,sha256=FdgEO-bWA_IDFIJb0W83qE4QCCZ8eexbn_Mq8LJq0iE,5040
@@ -410,8 +411,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
 unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
 unit_tests/utils/test_stream_status_utils.py,sha256=N2TxwKge45RHUKFlPcP2o5jXYjJPKMKiu6Fm2_leZYY,3388
 unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
-airbyte_cdk-0.53.2.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-0.53.2.dist-info/METADATA,sha256=DC0qcLA2D2QlnQKG4S8Ojcm01bLCdN5msrVZy0T6DhI,11983
-airbyte_cdk-0.53.2.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
-airbyte_cdk-0.53.2.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
-airbyte_cdk-0.53.2.dist-info/RECORD,,
+airbyte_cdk-0.53.4.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-0.53.4.dist-info/METADATA,sha256=n_0ZAbpdj_k3YO8y-B1g9javdvN876EaSuA6-Cu5xQY,11983
+airbyte_cdk-0.53.4.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
+airbyte_cdk-0.53.4.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
+airbyte_cdk-0.53.4.dist-info/RECORD,,

--- a/unit_tests/sources/file_based/file_types/test_unstructured_parser.py
+++ b/unit_tests/sources/file_based/file_types/test_unstructured_parser.py
@@ -7,6 +7,7 @@ from datetime import datetime
 from unittest.mock import MagicMock, mock_open, patch
 
 import pytest
+from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
 from airbyte_cdk.sources.file_based.exceptions import RecordParseError
 from airbyte_cdk.sources.file_based.file_types import UnstructuredParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -17,37 +18,48 @@ FILE_URI = "path/to/file.xyz"
 
 
 @pytest.mark.parametrize(
-    "filetype, raises",
+    "filetype, format_config, raises",
     [
         pytest.param(
             FileType.MD,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             False,
             id="markdown file",
         ),
         pytest.param(
             FileType.CSV,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             True,
             id="wrong file format",
         ),
+        pytest.param(
+            FileType.CSV,
+            UnstructuredFormat(skip_unprocessable_file_types=True),
+            False,
+            id="wrong file format skipping",
+        ),
         pytest.param(
             FileType.PDF,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             False,
             id="pdf file",
         ),
         pytest.param(
             FileType.DOCX,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             False,
             id="docx file",
        ),
         pytest.param(
             FileType.PPTX,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             False,
             id="pptx file",
         ),
     ],
 )
 @patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.detect_filetype")
-def test_infer_schema(mock_detect_filetype, filetype, raises):
+def test_infer_schema(mock_detect_filetype, filetype, format_config, raises):
     # use a fresh event loop to avoid leaking into other tests
     main_loop = asyncio.get_event_loop()
     loop = asyncio.new_event_loop()
@@ -59,11 +71,13 @@ def test_infer_schema(mock_detect_filetype, filetype, raises):
     fake_file.uri = FILE_URI
     logger = MagicMock()
     mock_detect_filetype.return_value = filetype
+    config = MagicMock()
+    config.format = format_config
     if raises:
         with pytest.raises(RecordParseError):
-            loop.run_until_complete(UnstructuredParser().infer_schema(MagicMock(), fake_file, stream_reader, logger))
+            loop.run_until_complete(UnstructuredParser().infer_schema(config, fake_file, stream_reader, logger))
     else:
-        schema = loop.run_until_complete(UnstructuredParser().infer_schema(MagicMock(), MagicMock(), MagicMock(), MagicMock()))
+        schema = loop.run_until_complete(UnstructuredParser().infer_schema(config, MagicMock(), MagicMock(), MagicMock()))
         assert schema == {
             "content": {"type": "string"},
             "document_key": {"type": "string"},
@@ -73,10 +87,11 @@ def test_infer_schema(mock_detect_filetype, filetype, raises):
 
 
 @pytest.mark.parametrize(
-    "filetype, parse_result, raises, expected_records",
+    "filetype, format_config, parse_result, raises, expected_records",
     [
         pytest.param(
             FileType.MD,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             "test",
             False,
             [
@@ -89,13 +104,23 @@ def test_infer_schema(mock_detect_filetype, filetype, raises):
         ),
         pytest.param(
             FileType.CSV,
-            "test",
+            UnstructuredFormat(skip_unprocessable_file_types=False),
+            None,
             True,
             None,
             id="wrong file format",
         ),
+        pytest.param(
+            FileType.CSV,
+            UnstructuredFormat(skip_unprocessable_file_types=True),
+            None,
+            False,
+            [],
+            id="skip_unprocessable_file_types",
+        ),
         pytest.param(
             FileType.PDF,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             [
                 Title("heading"),
                 Text("This is the text"),
@@ -113,6 +138,7 @@ def test_infer_schema(mock_detect_filetype, filetype, raises):
         ),
         pytest.param(
             FileType.PDF,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             [
                 Title("first level heading", metadata=ElementMetadata(category_depth=1)),
                 Title("second level heading", metadata=ElementMetadata(category_depth=2)),
@@ -128,6 +154,7 @@ def test_infer_schema(mock_detect_filetype, filetype, raises):
         ),
         pytest.param(
             FileType.DOCX,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             [
                 Title("heading"),
                 Text("This is the text"),
@@ -157,6 +184,7 @@ def test_parse_records(
     mock_partition_pptx,
     mock_partition_pdf,
     filetype,
+    format_config,
     parse_result,
     raises,
     expected_records,
@@ -166,6 +194,8 @@ def test_parse_records(
     fake_file = RemoteFile(uri=FILE_URI, last_modified=datetime.now())
     fake_file.uri = FILE_URI
     logger = MagicMock()
+    config = MagicMock()
+    config.format = format_config
     mock_detect_filetype.return_value = filetype
     mock_partition_docx.return_value = parse_result
     mock_partition_pptx.return_value = parse_result
@@ -173,6 +203,6 @@ def test_parse_records(
     mock_optional_decode.side_effect = lambda x: x.decode("utf-8")
     if raises:
         with pytest.raises(RecordParseError):
-            list(UnstructuredParser().parse_records(MagicMock(), fake_file, stream_reader, logger, MagicMock()))
+            list(UnstructuredParser().parse_records(config, fake_file, stream_reader, logger, MagicMock()))
     else:
-        assert list(UnstructuredParser().parse_records(MagicMock(), fake_file, stream_reader, logger, MagicMock())) == expected_records
+        assert list(UnstructuredParser().parse_records(config, fake_file, stream_reader, logger, MagicMock())) == expected_records

--- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py
+++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py
@@ -122,6 +122,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "type": "boolean",
             },
         },
+        "required": ["filetype"],
     },
     {
         "title": "CSV Format",
@@ -200,6 +201,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "type": "string",
             },
         },
+        "required": ["header_definition_type"],
     },
     {
         "title": "Autogenerated",
@@ -212,6 +214,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "type": "string",
             },
         },
+        "required": ["header_definition_type"],
     },
     {
         "title": "User Provided",
@@ -230,7 +233,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "items": {"type": "string"},
             },
         },
-        "required": ["column_names"],
+        "required": ["column_names", "header_definition_type"],
     },
 ],
 },
@@ -258,6 +261,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "enum": ["None", "Primitive Types Only"],
             },
         },
+        "required": ["filetype"],
     },
     {
         "title": "Jsonl Format",
@@ -265,6 +269,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
         "properties": {
             "filetype": {"title": "Filetype", "default": "jsonl", "const": "jsonl", "type": "string"}
         },
+        "required": ["filetype"],
     },
     {
         "title": "Parquet Format",
@@ -283,6 +288,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "type": "boolean",
             },
         },
+        "required": ["filetype"],
     },
     {
         "title": "Document File Type Format (Experimental)",
@@ -293,9 +299,17 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "default": "unstructured",
                 "const": "unstructured",
                 "type": "string",
-            }
+            },
+            "skip_unprocessable_file_types": {
+                "type": "boolean",
+                "default": True,
+                "title": "Skip Unprocessable File Types",
+                "description": "If true, skip files that cannot be parsed because of their file type and log a warning. If false, fail the sync. Corrupted files with valid file types will still result in a failed sync.",
+                "always_show": True,
+            },
         },
         "description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file.",
+        "required": ["filetype"],
     },
 ],
 },

--- a/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py
+++ b/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py
@@ -112,15 +112,16 @@ simple_markdown_scenario = (
     )
 ).build()
 
-unstructured_invalid_file_type_discover_scenario = (
+# If skip unprocessable file types is set to false, then discover will fail if it encounters a non-matching file type
+unstructured_invalid_file_type_discover_scenario_no_skip = (
     TestScenarioBuilder()
-    .set_name("unstructured_invalid_file_type_discover_scenario")
+    .set_name("unstructured_invalid_file_type_discover_scenario_no_skip")
     .set_config(
         {
             "streams": [
                 {
                     "name": "stream1",
-                    "format": {"filetype": "unstructured"},
+                    "format": {"filetype": "unstructured", "skip_unprocessable_file_types": False},
                     "globs": ["*"],
                     "validation_policy": "Emit Record",
                 }
@@ -172,6 +173,69 @@ unstructured_invalid_file_type_discover_scenario = (
     .set_expected_discover_error(AirbyteTracedException, "Error inferring schema from files")
 ).build()
 
+# If skip unprocessable file types is set to true, then discover will succeed even if there are non-matching file types
+unstructured_invalid_file_type_discover_scenario_skip = (
+    TestScenarioBuilder()
+    .set_name("unstructured_invalid_file_type_discover_scenario_skip")
+    .set_config(
+        {
+            "streams": [
+                {
+                    "name": "stream1",
+                    "format": {"filetype": "unstructured", "skip_unprocessable_file_types": True},
+                    "globs": ["*"],
+                    "validation_policy": "Emit Record",
+                }
+            ]
+        }
+    )
+    .set_source_builder(
+        FileBasedSourceBuilder()
+        .set_files(
+            {
+                "a.txt": {
+                    "contents": bytes("Just a humble text file", "UTF-8"),
+                    "last_modified": "2023-06-05T03:54:07.000Z",
+                },
+            }
+        )
+        .set_file_type("unstructured")
+    )
+    .set_expected_catalog(
+        {
+            "streams": [
+                {
+                    "default_cursor_field": ["_ab_source_file_last_modified"],
+                    "json_schema": {
+                        "type": "object",
+                        "properties": {
+                            "document_key": {
+                                "type": ["null", "string"],
+                            },
+                            "content": {
+                                "type": ["null", "string"],
+                            },
+                            "_ab_source_file_last_modified": {
+                                "type": "string",
+                            },
+                            "_ab_source_file_url": {
+                                "type": "string",
+                            },
+                        },
+                    },
+                    "name": "stream1",
+                    "source_defined_cursor": True,
+                    "supported_sync_modes": ["full_refresh", "incremental"],
+                }
+            ]
+        }
+    )
+    .set_expected_records([])
+).build()
+
+# TODO When working on https://github.com/airbytehq/airbyte/issues/31605, this test should be split into two tests:
+# 1. Test that the file is skipped if skip_unprocessable_file_types is set to true
+# 2. Test that the sync fails if skip_unprocessable_file_types is set to false
 unstructured_invalid_file_type_read_scenario = (
     TestScenarioBuilder()
     .set_name("unstructured_invalid_file_type_read_scenario")

--- a/unit_tests/sources/file_based/test_file_based_scenarios.py
+++ b/unit_tests/sources/file_based/test_file_based_scenarios.py
@@ -102,7 +102,8 @@ from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenari
 from unit_tests.sources.file_based.scenarios.unstructured_scenarios import (
     simple_markdown_scenario,
     simple_unstructured_scenario,
-    unstructured_invalid_file_type_discover_scenario,
+    unstructured_invalid_file_type_discover_scenario_no_skip,
+    unstructured_invalid_file_type_discover_scenario_skip,
     unstructured_invalid_file_type_read_scenario,
 )
 from unit_tests.sources.file_based.scenarios.user_input_schema_scenarios import (
@@ -203,7 +204,8 @@ discover_scenarios = [
     single_partitioned_parquet_scenario,
     simple_markdown_scenario,
     simple_unstructured_scenario,
-    unstructured_invalid_file_type_discover_scenario,
+    unstructured_invalid_file_type_discover_scenario_no_skip,
+    unstructured_invalid_file_type_discover_scenario_skip,
     unstructured_invalid_file_type_read_scenario,
 ]
 

--- a/unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py
+++ b/unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py
@@ -13,6 +13,8 @@ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partitio
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
 from airbyte_cdk.sources.streams.concurrent.thread_based_concurrent_stream import ThreadBasedConcurrentStream
 
+_MAX_CONCURRENT_TASKS = 2
+
 
 class ThreadBasedConcurrentStreamTest(unittest.TestCase):
     def setUp(self):
@@ -39,7 +41,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
             self._logger,
             self._message_repository,
             1,
-            2,
+            _MAX_CONCURRENT_TASKS,
             0,
             cursor=self._cursor,
         )
@@ -142,15 +144,33 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
         f2 = Mock()
 
         # Verify that the done() method will be called until only one future is still running
-        f1.done.return_value = False
+        f1.done.return_value = True
         f1.exception.return_value = None
-        f2.done.return_value = False
-        f2.exception.return_value = ValueError("An exception")
+        f2.done.return_value = True
+        f2.exception.return_value = ValueError("ERROR")
         futures = [f1, f2]
 
         with pytest.raises(RuntimeError):
             self._stream._wait_while_too_many_pending_futures(futures)
 
+    def test_given_removing_multiple_elements_when_pruning_then_fail_immediately(self):
+        # Verify that the done() method will be called until only one future is still running
+        futures = []
+        for _ in range(_MAX_CONCURRENT_TASKS + 1):
+            future = Mock()
+            future.done.return_value = True
+            future.exception.return_value = None
+            futures.append(future)
+
+        pending_future = Mock()
+        pending_future.done.return_value = False
+        pending_future.exception.return_value = None
+        futures.append(pending_future)
+
+        self._stream._wait_while_too_many_pending_futures(futures)
+
+        assert futures == [pending_future]
+
     def test_as_airbyte_stream(self):
         expected_airbyte_stream = AirbyteStream(
             name=self._name,