PyPI - airbyte-cdk - Versions diffs - 0.57.1__py3-none-any.whl → 0.57.3__py3-none-any.whl - Mend

airbyte-cdk 0.57.1__py3-none-any.whl → 0.57.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

airbyte_cdk/destinations/vector_db_based/embedder.py CHANGED Viewed

@@ -4,6 +4,7 @@
 import os
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from typing import List, Optional, Union, cast
 from airbyte_cdk.destinations.vector_db_based.config import (
@@ -15,8 +16,8 @@ from airbyte_cdk.destinations.vector_db_based.config import (
     OpenAIEmbeddingConfigModel,
     ProcessingConfigModel,
 )
-from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
 from airbyte_cdk.destinations.vector_db_based.utils import create_chunks, format_exception
+from airbyte_cdk.models import AirbyteRecordMessage
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
 from langchain.embeddings.cohere import CohereEmbeddings
 from langchain.embeddings.fake import FakeEmbeddings
@@ -24,6 +25,12 @@ from langchain.embeddings.localai import LocalAIEmbeddings
 from langchain.embeddings.openai import OpenAIEmbeddings
+@dataclass
+class Document:
+    page_content: str
+    record: AirbyteRecordMessage
 class Embedder(ABC):
     """
     Embedder is an abstract class that defines the interface for embedding text.
@@ -41,7 +48,7 @@ class Embedder(ABC):
         pass
     @abstractmethod
-    def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
+    def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
         """
         Embed the text of each chunk and return the resulting embedding vectors.
         If a chunk cannot be embedded or is configured to not be embedded, return None for that chunk.
@@ -72,7 +79,7 @@ class BaseOpenAIEmbedder(Embedder):
             return format_exception(e)
         return None
-    def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
+    def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
         """
         Embed the text of each chunk and return the resulting embedding vectors.
@@ -80,9 +87,9 @@ class BaseOpenAIEmbedder(Embedder):
         It's still possible to run into the rate limit between each embed call because the available token budget hasn't recovered between the calls,
         but the built-in retry mechanism of the OpenAI client handles that.
         """
-        # Each chunk can hold at most self.chunk_size tokens, so tokens-per-minute by maximum tokens per chunk is the number of chunks that can be embedded at once without exhausting the limit in a single request
+        # Each chunk can hold at most self.chunk_size tokens, so tokens-per-minute by maximum tokens per chunk is the number of documents that can be embedded at once without exhausting the limit in a single request
         embedding_batch_size = OPEN_AI_TOKEN_LIMIT // self.chunk_size
-        batches = create_chunks(chunks, batch_size=embedding_batch_size)
+        batches = create_chunks(documents, batch_size=embedding_batch_size)
         embeddings: List[Optional[List[float]]] = []
         for batch in batches:
             embeddings.extend(self.embeddings.embed_documents([chunk.page_content for chunk in batch]))
@@ -121,8 +128,8 @@ class CohereEmbedder(Embedder):
             return format_exception(e)
         return None
-    def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
-        return cast(List[Optional[List[float]]], self.embeddings.embed_documents([chunk.page_content or "" for chunk in chunks]))
+    def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
+        return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
     @property
     def embedding_dimensions(self) -> int:
@@ -142,8 +149,8 @@ class FakeEmbedder(Embedder):
             return format_exception(e)
         return None
-    def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
-        return cast(List[Optional[List[float]]], self.embeddings.embed_documents([chunk.page_content or "" for chunk in chunks]))
+    def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
+        return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
     @property
     def embedding_dimensions(self) -> int:
@@ -173,8 +180,8 @@ class OpenAICompatibleEmbedder(Embedder):
             return format_exception(e)
         return None
-    def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
-        return cast(List[Optional[List[float]]], self.embeddings.embed_documents([chunk.page_content or "" for chunk in chunks]))
+    def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
+        return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
     @property
     def embedding_dimensions(self) -> int:
@@ -190,32 +197,32 @@ class FromFieldEmbedder(Embedder):
     def check(self) -> Optional[str]:
         return None
-    def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
+    def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
         """
         From each chunk, pull the embedding from the field specified in the config.
         Check that the field exists, is a list of numbers and is the correct size. If not, raise an AirbyteTracedException explaining the problem.
         """
         embeddings: List[Optional[List[float]]] = []
-        for chunk in chunks:
-            data = chunk.record.data
+        for document in documents:
+            data = document.record.data
             if self.config.field_name not in data:
                 raise AirbyteTracedException(
                     internal_message="Embedding vector field not found",
                     failure_type=FailureType.config_error,
-                    message=f"Record {str(data)[:250]}... in stream {chunk.record.stream}  does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.",
+                    message=f"Record {str(data)[:250]}... in stream {document.record.stream}  does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.",
                 )
             field = data[self.config.field_name]
             if not isinstance(field, list) or not all(isinstance(x, (int, float)) for x in field):
                 raise AirbyteTracedException(
                     internal_message="Embedding vector field not a list of numbers",
                     failure_type=FailureType.config_error,
-                    message=f"Record {str(data)[:250]}...  in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
+                    message=f"Record {str(data)[:250]}...  in stream {document.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
                 )
             if len(field) != self.config.dimensions:
                 raise AirbyteTracedException(
                     internal_message="Embedding vector field has wrong length",
                     failure_type=FailureType.config_error,
-                    message=f"Record {str(data)[:250]}...  in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
+                    message=f"Record {str(data)[:250]}...  in stream {document.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
                 )
             embeddings.append(field)

airbyte_cdk/destinations/vector_db_based/writer.py CHANGED Viewed

@@ -8,7 +8,7 @@ from typing import Dict, Iterable, List, Tuple
 from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
 from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk, DocumentProcessor
-from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
+from airbyte_cdk.destinations.vector_db_based.embedder import Document, Embedder
 from airbyte_cdk.destinations.vector_db_based.indexer import Indexer
 from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog, Type
@@ -16,14 +16,14 @@ from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog, Type
 class Writer:
     """
     The Writer class is orchestrating the document processor, the embedder and the indexer:
-    * Incoming records are passed through the document processor to generate documents
-    * One the configured batch size is reached, the documents are passed to the embedder to generate embeddings
-    * The embedder embeds the documents
-    * The indexer deletes old documents by the associated record id before indexing the new ones
+    * Incoming records are passed through the document processor to generate chunks
+    * One the configured batch size is reached, the chunks are passed to the embedder to generate embeddings
+    * The embedder embeds the chunks
+    * The indexer deletes old chunks by the associated record id before indexing the new ones
     The destination connector is responsible to create a writer instance and pass the input messages iterable to the write method.
     The batch size can be configured by the destination connector to give the freedom of either letting the user configure it or hardcoding it to a sensible value depending on the destination.
-    The omit_raw_text parameter can be used to omit the raw text from the documents. This can be useful if the raw text is very large and not needed for the destination.
+    The omit_raw_text parameter can be used to omit the raw text from the chunks. This can be useful if the raw text is very large and not needed for the destination.
     """
     def __init__(
@@ -37,21 +37,29 @@ class Writer:
         self._init_batch()
     def _init_batch(self) -> None:
-        self.documents: Dict[Tuple[str, str], List[Chunk]] = defaultdict(list)
+        self.chunks: Dict[Tuple[str, str], List[Chunk]] = defaultdict(list)
         self.ids_to_delete: Dict[Tuple[str, str], List[str]] = defaultdict(list)
-        self.number_of_documents = 0
+        self.number_of_chunks = 0
+    def _convert_to_document(self, chunk: Chunk) -> Document:
+        """
+        Convert a chunk to a document for the embedder.
+        """
+        if chunk.page_content is None:
+            raise ValueError("Cannot embed a chunk without page content")
+        return Document(page_content=chunk.page_content, record=chunk.record)
     def _process_batch(self) -> None:
         for (namespace, stream), ids in self.ids_to_delete.items():
             self.indexer.delete(ids, namespace, stream)
-        for (namespace, stream), documents in self.documents.items():
-            embeddings = self.embedder.embed_chunks(documents)
-            for i, document in enumerate(documents):
+        for (namespace, stream), chunks in self.chunks.items():
+            embeddings = self.embedder.embed_documents([self._convert_to_document(chunk) for chunk in chunks])
+            for i, document in enumerate(chunks):
                 document.embedding = embeddings[i]
                 if self.omit_raw_text:
                     document.page_content = None
-            self.indexer.index(documents, namespace, stream)
+            self.indexer.index(chunks, namespace, stream)
         self._init_batch()
@@ -65,12 +73,12 @@ class Writer:
                 self._process_batch()
                 yield message
             elif message.type == Type.RECORD:
-                record_documents, record_id_to_delete = self.processor.process(message.record)
-                self.documents[(message.record.namespace, message.record.stream)].extend(record_documents)
+                record_chunks, record_id_to_delete = self.processor.process(message.record)
+                self.chunks[(message.record.namespace, message.record.stream)].extend(record_chunks)
                 if record_id_to_delete is not None:
                     self.ids_to_delete[(message.record.namespace, message.record.stream)].append(record_id_to_delete)
-                self.number_of_documents += len(record_documents)
-                if self.number_of_documents >= self.batch_size:
+                self.number_of_chunks += len(record_chunks)
+                if self.number_of_chunks >= self.batch_size:
                     self._process_batch()
         self._process_batch()

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py CHANGED Viewed

@@ -3,6 +3,7 @@
 #
 import logging
 import traceback
+from datetime import datetime
 from io import BytesIO, IOBase
 from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
@@ -56,6 +57,8 @@ def user_error(e: Exception) -> bool:
     """
     Return True if this exception is caused by user error, False otherwise.
     """
+    if not isinstance(e, RecordParseError):
+        return False
     if not isinstance(e, requests.exceptions.RequestException):
         return False
     return bool(e.response and 400 <= e.response.status_code < 500)
@@ -164,10 +167,14 @@ class UnstructuredParser(FileTypeParser):
             return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
         elif format.processing.mode == "api":
             try:
-                result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy)
+                result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy, remote_file)
             except Exception as e:
-                # Re-throw as config error so the sync is stopped as problems with the external API need to be resolved by the user and are not considered part of the SLA.
+                # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
+                #
+                # For other exceptions, re-throw as config error so the sync is stopped as problems with the external API need to be resolved by the user and are not considered part of the SLA.
                 # Once this parser leaves experimental stage, we should consider making this a system error instead for issues that might be transient.
+                if isinstance(e, RecordParseError):
+                    raise e
                 raise AirbyteTracedException.from_exception(e, failure_type=FailureType.config_error)
             return result
@@ -210,7 +217,13 @@ class UnstructuredParser(FileTypeParser):
             return False, "Base URL must start with https://"
         try:
-            self._read_file_remotely(BytesIO(b"# Airbyte source connection test"), format_config.processing, FileType.MD, "auto")
+            self._read_file_remotely(
+                BytesIO(b"# Airbyte source connection test"),
+                format_config.processing,
+                FileType.MD,
+                "auto",
+                RemoteFile(uri="test", last_modified=datetime.now()),
+            )
         except Exception:
             return False, "".join(traceback.format_exc())
@@ -218,14 +231,16 @@ class UnstructuredParser(FileTypeParser):
     @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error)
     def _read_file_remotely_with_retries(
-        self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str
+        self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
     ) -> str:
         """
         Read a file remotely, retrying up to 5 times if the error is not caused by user error. This is useful for transient network errors or the API server being overloaded temporarily.
         """
-        return self._read_file_remotely(file_handle, format, filetype, strategy)
+        return self._read_file_remotely(file_handle, format, filetype, strategy, remote_file)
-    def _read_file_remotely(self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str) -> str:
+    def _read_file_remotely(
+        self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
+    ) -> str:
         headers = {"accept": "application/json", "unstructured-api-key": format.api_key}
         data = self._params_to_dict(format.parameters, strategy)
@@ -233,7 +248,13 @@ class UnstructuredParser(FileTypeParser):
         file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
         response = requests.post(f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data)
-        response.raise_for_status()
+        if response.status_code == 422:
+            # 422 means the file couldn't be processed, but the API is working. Treat this as a parsing error (passing an error record to the destination).
+            raise self._create_parse_error(remote_file, response.json())
+        else:
+            # Other error statuses are raised as requests exceptions (retry everything except user errors)
+            response.raise_for_status()
         json_response = response.json()

{airbyte_cdk-0.57.1.dist-info → airbyte_cdk-0.57.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.57.1
+Version: 0.57.3
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.8
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE.txt
-Requires-Dist: airbyte-protocol-models ==0.4.2
+Requires-Dist: airbyte-protocol-models ==0.5.1
 Requires-Dist: backoff
 Requires-Dist: dpath ~=2.0.1
 Requires-Dist: isodate ~=0.6.1

{airbyte_cdk-0.57.1.dist-info → airbyte_cdk-0.57.3.dist-info}/RECORD RENAMED Viewed

@@ -15,11 +15,11 @@ airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQw
 airbyte_cdk/destinations/vector_db_based/__init__.py,sha256=eAkzwTjBbXBhJ5GfPO5I53Zgpv5xQFLRQS8n4nuyPt0,1006
 airbyte_cdk/destinations/vector_db_based/config.py,sha256=ibGA5rQepeiscNTZC6GlvYaL_m3EhNGJ0FkegYo1CiU,12324
 airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=DjyegDH7jYh7N_1JiYSDaqc3OMEb4V5R_LtGxaGOhW4,9083
-airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=SNNEePbKD_OlDCmT3ZvbbYGYc9K0sH-4eT1sR8cRZ90,11264
+airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=HxQCPwRpALmo5MvEhTuXdjinoBzlbNVvVunRw3EVgaE,11443
 airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=beiSi2Uu67EoTr7yQSaCJFAh9RajHFGKA4PoTbpTOqM,3243
 airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=8d1Smk4jQRKtDfloXfEq12T-BU8ByyzzSBwAlchsU4A,1807
 airbyte_cdk/destinations/vector_db_based/utils.py,sha256=dKpjY0QQVr5wMe6XHE_XdeL-nNqAew5InCfxkbyyf5A,1073
-airbyte_cdk/destinations/vector_db_based/writer.py,sha256=xMZVoOYshPp1bHm_9lJ752sUPGPasMKy2H_9TGMghj0,4053
+airbyte_cdk/destinations/vector_db_based/writer.py,sha256=2EOkNcOe9pKGz7DgC6iSHjWoxbYF0IZ7PcpsQYIOgUk,4394
 airbyte_cdk/models/__init__.py,sha256=Kg8YHBqUsNWHlAw-u3ZGdG4dxLh7qBlHhqMRfamNCRU,1708
 airbyte_cdk/models/airbyte_protocol.py,sha256=DoJvnmGM3xMAZFTwA6_RGMiKSFqfE3ib_Ru0KJ65Ag4,100
 airbyte_cdk/models/well_known_types.py,sha256=KKfNbow2gdLoC1Z4hcXy_JR8m_acsB2ol7gQuEgjobw,117
@@ -177,7 +177,7 @@ airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=VS2Ld9rfm4tLkwNZ3
 airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=Gbn-8v1-jLhKpJXTNOOc5PZT1Jzah6G-INCZt4snLdQ,2819
 airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=S7OtfRRvQ8P6YbZVdJ8h7mw1hnWQUVSHR9Jy12U1Yy0,5634
 airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=Jq_-WSbyueVwyLYrrGafXhvcA1LDOeps0A_uBhStOHI,9017
-airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=rY_4XuZ2nyI4487Bp7qKGM-hHGiDNxEy7w3kVrdx5vQ,15663
+airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=vw9As28N7QPWkSPq0v-mHvnRtoiM51q8swpX4iG-1vI,16694
 airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
 airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
 airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
@@ -375,7 +375,7 @@ unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=LCoGa0fvOber
 unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=zgHjLfPASRwFxkubdRK0UkskGTOAdASpWHKucm0AmqM,22423
 unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
 unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=J66wfbAaflSe5y3ixCZ4tLPEQdU62eYj-pNXycCtK0U,14159
-unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=ailTQJ3zWciQwNUsem6XdS4WETQW_OJrpXt6S78zb5Y,21153
+unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=Ts-5Zzcq8ETwgb2aCXlk5EIZtQTM9_OOx1HL8TNk0IU,22454
 unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=oeQUmCV7d2aTShreYc-PvVb4cWqLSsVwHfg-lcKjzPs,30554
 unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=0xkt21ASTnTAMP0RYJEsF3yMGsNN7wWOoG_tmzL9PYw,6750
@@ -439,8 +439,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
 unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
 unit_tests/utils/test_stream_status_utils.py,sha256=Xr8MZ2HWgTVIyMbywDvuYkRaUF4RZLQOT8-JjvcfR24,2970
 unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
-airbyte_cdk-0.57.1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-0.57.1.dist-info/METADATA,sha256=ZAdnO4gzEpVsZ46_Wc1I2bkqr8SFvWk30G9G4oOPOik,11983
-airbyte_cdk-0.57.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-airbyte_cdk-0.57.1.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
-airbyte_cdk-0.57.1.dist-info/RECORD,,
+airbyte_cdk-0.57.3.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-0.57.3.dist-info/METADATA,sha256=rPyT57ZcShy6cYBeEwA7cf35b8NVHwZL3eX-3X-K_B8,11983
+airbyte_cdk-0.57.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+airbyte_cdk-0.57.3.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
+airbyte_cdk-0.57.3.dist-info/RECORD,,

unit_tests/sources/file_based/file_types/test_unstructured_parser.py CHANGED Viewed

@@ -315,7 +315,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
 @pytest.mark.parametrize(
-    "filetype, format_config, raises_for_status, file_content, json_response, expected_requests, raises, expected_records",
+    "filetype, format_config, raises_for_status, file_content, json_response, expected_requests, raises, expected_records, http_status_code",
     [
         pytest.param(
             FileType.PDF,
@@ -332,6 +332,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
                     "_ab_source_file_parse_error": None
                 }
             ],
+            200,
             id="basic_request",
         ),
         pytest.param(
@@ -349,6 +350,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
                     "_ab_source_file_parse_error": None
                 }
             ],
+            200,
             id="request_with_params",
         ),
         pytest.param(
@@ -366,6 +368,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
                     "_ab_source_file_parse_error": None
                 }
             ],
+            200,
             id="handle_markdown_locally",
         ),
         pytest.param(
@@ -394,6 +397,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
             ],
             True,
             None,
+            200,
             id="retry_and_raise_on_api_error",
         ),
         pytest.param(
@@ -422,6 +426,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
                     "_ab_source_file_parse_error": None
                 }
             ],
+            200,
             id="retry_and_recover",
         ),
         pytest.param(
@@ -438,6 +443,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
             ],
             True,
             None,
+            200,
             id="no_retry_on_unexpected_error",
         ),
         pytest.param(
@@ -454,8 +460,29 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
             ],
             True,
             None,
+            400,
             id="no_retry_on_400_error",
         ),
+        pytest.param(
+            FileType.PDF,
+            UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")),
+            None,
+            "test",
+            [{"detail": "Something went wrong"}],
+            [
+                call("https://api.unstructured.io/general/v0/general", headers={"accept": "application/json", "unstructured-api-key": "test"}, data={"strategy": "auto"}, files={"files": ("filename", mock.ANY, "application/pdf")}),
+            ],
+            False,
+            [
+                {
+                    "content": None,
+                    "document_key": FILE_URI,
+                    "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=path/to/file.xyz message=[{'detail': 'Something went wrong'}]",
+                }
+            ],
+            422,
+            id="error_record_on_422_error",
+        ),
     ],
 )
 @patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.requests")
@@ -473,6 +500,7 @@ def test_parse_records_remotely(
     expected_requests,
     raises,
     expected_records,
+    http_status_code
 ):
     stream_reader = MagicMock()
     mock_open(stream_reader.open_file, read_data=bytes(str(file_content), "utf-8"))
@@ -484,6 +512,7 @@ def test_parse_records_remotely(
     mock_detect_filetype.return_value = filetype
     mock_response = MagicMock()
     mock_response.json.return_value = json_response
+    mock_response.status_code = http_status_code
     if raises_for_status:
         mock_response.raise_for_status.side_effect = raises_for_status
     requests_mock.post.return_value = mock_response

{airbyte_cdk-0.57.1.dist-info → airbyte_cdk-0.57.3.dist-info}/LICENSE.txt RENAMED Viewed

File without changes

{airbyte_cdk-0.57.1.dist-info → airbyte_cdk-0.57.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{airbyte_cdk-0.57.1.dist-info → airbyte_cdk-0.57.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

airbyte-cdk 0.57.1__py3-none-any.whl → 0.57.3__py3-none-any.whl

airbyte-cdk 0.57.1__py3-none-any.whl → 0.57.3__py3-none-any.whl