PyPI - airbyte-cdk - Versions diffs - 0.51.10__py3-none-any.whl → 0.51.12__py3-none-any.whl - Mend

airbyte-cdk 0.51.10 (py3-none-any.whl) → 0.51.12 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

airbyte_cdk/destinations/vector_db_based/config.py CHANGED Viewed

@@ -59,6 +59,22 @@ class FakeEmbeddingConfigModel(BaseModel):
         }
+class FromFieldEmbeddingConfigModel(BaseModel):
+    mode: Literal["from_field"] = Field("from_field", const=True)
+    field_name: str = Field(
+        ..., title="Field name", description="Name of the field in the record that contains the embedding", examples=["embedding", "vector"]
+    )
+    dimensions: int = Field(
+        ..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
+    )
+    class Config:
+        title = "From Field"
+        schema_extra = {
+            "description": "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
+        }
 class CohereEmbeddingConfigModel(BaseModel):
     mode: Literal["cohere"] = Field("cohere", const=True)
     cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True)

airbyte_cdk/destinations/vector_db_based/document_processor.py CHANGED Viewed

@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
 import dpath.util
 from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
 from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode
+from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
 from langchain.document_loaders.base import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.utils import stringify_dict
@@ -21,8 +22,8 @@ METADATA_RECORD_ID_FIELD = "_ab_record_id"
 class Chunk:
     page_content: str
     metadata: Dict[str, Any]
-    stream: str
-    namespace: Optional[str] = None
+    record: AirbyteRecordMessage
+    embedding: Optional[List[float]] = None
 class DocumentProcessor:
@@ -66,11 +67,14 @@ class DocumentProcessor:
         """
         doc = self._generate_document(record)
         if doc is None:
-            raise ValueError(f"Record {str(record.data)[:250]}... does not contain any text fields.")
-        chunks = [
-            Chunk(
-                page_content=chunk_document.page_content, metadata=chunk_document.metadata, stream=record.stream, namespace=record.namespace
+            text_fields = ", ".join(self.text_fields) if self.text_fields else "all fields"
+            raise AirbyteTracedException(
+                internal_message="No text fields found in record",
+                message=f"Record {str(record.data)[:250]}... does not contain any of the configured text fields: {text_fields}. Please check your processing configuration, there has to be at least one text field set in each record.",
+                failure_type=FailureType.config_error,
             )
+        chunks = [
+            Chunk(page_content=chunk_document.page_content, metadata=chunk_document.metadata, record=record)
             for chunk_document in self._split_document(doc)
         ]
         id_to_delete = doc.metadata[METADATA_RECORD_ID_FIELD] if METADATA_RECORD_ID_FIELD in doc.metadata else None

airbyte_cdk/destinations/vector_db_based/embedder.py CHANGED Viewed

@@ -5,8 +5,15 @@
 from abc import ABC, abstractmethod
 from typing import List, Optional
-from airbyte_cdk.destinations.vector_db_based.config import CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, OpenAIEmbeddingConfigModel
+from airbyte_cdk.destinations.vector_db_based.config import (
+    CohereEmbeddingConfigModel,
+    FakeEmbeddingConfigModel,
+    FromFieldEmbeddingConfigModel,
+    OpenAIEmbeddingConfigModel,
+)
+from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
 from airbyte_cdk.destinations.vector_db_based.utils import format_exception
+from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
 from langchain.embeddings.cohere import CohereEmbeddings
 from langchain.embeddings.fake import FakeEmbeddings
 from langchain.embeddings.openai import OpenAIEmbeddings
@@ -17,7 +24,7 @@ class Embedder(ABC):
     Embedder is an abstract class that defines the interface for embedding text.
     The Indexer class uses the Embedder class to internally embed text - each indexer is responsible to pass the text of all documents to the embedder and store the resulting embeddings in the destination.
-    The destination connector is responsible to create an embedder instance and pass it to the indexer.
+    The destination connector is responsible to create an embedder instance and pass it to the writer.
     The CDK defines basic embedders that should be supported in each destination. It is possible to implement custom embedders for special destinations if needed.
     """
@@ -29,7 +36,11 @@ class Embedder(ABC):
         pass
     @abstractmethod
-    def embed_texts(self, texts: List[str]) -> List[List[float]]:
+    def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
+        """
+        Embed the text of each chunk and return the resulting embedding vectors.
+        If a chunk cannot be embedded or is configured to not be embedded, return None for that chunk.
+        """
         pass
     @property
@@ -54,8 +65,8 @@ class OpenAIEmbedder(Embedder):
             return format_exception(e)
         return None
-    def embed_texts(self, texts: List[str]) -> List[List[float]]:
-        return self.embeddings.embed_documents(texts)
+    def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
+        return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
     @property
     def embedding_dimensions(self) -> int:
@@ -79,8 +90,8 @@ class CohereEmbedder(Embedder):
             return format_exception(e)
         return None
-    def embed_texts(self, texts: List[str]) -> List[List[float]]:
-        return self.embeddings.embed_documents(texts)
+    def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
+        return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
     @property
     def embedding_dimensions(self) -> int:
@@ -100,10 +111,54 @@ class FakeEmbedder(Embedder):
             return format_exception(e)
         return None
-    def embed_texts(self, texts: List[str]) -> List[List[float]]:
-        return self.embeddings.embed_documents(texts)
+    def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
+        return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
     @property
     def embedding_dimensions(self) -> int:
         # use same vector size as for OpenAI embeddings to keep it realistic
         return OPEN_AI_VECTOR_SIZE
+class FromFieldEmbedder(Embedder):
+    def __init__(self, config: FromFieldEmbeddingConfigModel):
+        super().__init__()
+        self.config = config
+    def check(self) -> Optional[str]:
+        return None
+    def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
+        """
+        From each chunk, pull the embedding from the field specified in the config.
+        Check that the field exists, is a list of numbers and is the correct size. If not, raise an AirbyteTracedException explaining the problem.
+        """
+        embeddings = []
+        for chunk in chunks:
+            data = chunk.record.data
+            if self.config.field_name not in data:
+                raise AirbyteTracedException(
+                    internal_message="Embedding vector field not found",
+                    failure_type=FailureType.config_error,
+                    message=f"Record {str(data)[:250]}... in stream {chunk.record.stream}  does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.",
+                )
+            field = data[self.config.field_name]
+            if not isinstance(field, list) or not all(isinstance(x, (int, float)) for x in field):
+                raise AirbyteTracedException(
+                    internal_message="Embedding vector field not a list of numbers",
+                    failure_type=FailureType.config_error,
+                    message=f"Record {str(data)[:250]}...  in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
+                )
+            if len(field) != self.config.dimensions:
+                raise AirbyteTracedException(
+                    internal_message="Embedding vector field has wrong length",
+                    failure_type=FailureType.config_error,
+                    message=f"Record {str(data)[:250]}...  in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
+                )
+            embeddings.append(field)
+        return embeddings
+    @property
+    def embedding_dimensions(self) -> int:
+        return self.config.dimensions

airbyte_cdk/destinations/vector_db_based/indexer.py CHANGED Viewed

@@ -7,7 +7,6 @@ from abc import ABC, abstractmethod
 from typing import Any, Generator, Iterable, List, Optional, Tuple, TypeVar
 from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
-from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
 from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog
@@ -19,9 +18,8 @@ class Indexer(ABC):
     In a destination connector, implement a custom indexer by extending this class and implementing the abstract methods.
     """
-    def __init__(self, config: Any, embedder: Embedder):
+    def __init__(self, config: Any):
         self.config = config
-        self.embedder = embedder
         pass
     def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None:

airbyte_cdk/destinations/vector_db_based/test_utils.py CHANGED Viewed

@@ -48,6 +48,6 @@ class BaseIntegrationTest(unittest.TestCase):
             type=Type.RECORD, record=AirbyteRecordMessage(stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0)
         )
-    def setUp(self):
+    def setUp(self) -> None:
         with open("secrets/config.json", "r") as f:
             self.config = json.loads(f.read())

airbyte_cdk/destinations/vector_db_based/writer.py CHANGED Viewed

@@ -8,24 +8,27 @@ from typing import Iterable, List
 from airbyte_cdk.destinations.vector_db_based.batcher import Batcher
 from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
 from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk, DocumentProcessor
+from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
 from airbyte_cdk.destinations.vector_db_based.indexer import Indexer
 from airbyte_cdk.models import AirbyteMessage, AirbyteRecordMessage, ConfiguredAirbyteCatalog, Type
 class Writer:
     """
-    The Writer class is orchestrating the document processor, the batcher and the indexer:
+    The Writer class is orchestrating the document processor, the batcher, the embedder and the indexer:
     * Incoming records are collected using the batcher
     * The document processor generates documents from all records in the batch
-    * The indexer indexes the resulting documents in the destination
+    * The embedder embeds the documents
+    * The indexer indexes the resulting documents and their embeddings in the destination
     The destination connector is responsible to create a writer instance and pass the input messages iterable to the write method.
     The batch size can be configured by the destination connector to give the freedom of either letting the user configure it or hardcoding it to a sensible value depending on the destination.
     """
-    def __init__(self, processing_config: ProcessingConfigModel, indexer: Indexer, batch_size: int) -> None:
+    def __init__(self, processing_config: ProcessingConfigModel, indexer: Indexer, embedder: Embedder, batch_size: int) -> None:
         self.processing_config = processing_config
         self.indexer = indexer
+        self.embedder = embedder
         self.batcher = Batcher(batch_size, lambda batch: self._process_batch(batch))
     def _process_batch(self, batch: List[AirbyteRecordMessage]) -> None:
@@ -36,6 +39,9 @@ class Writer:
             documents.extend(record_documents)
             if record_id_to_delete is not None:
                 ids_to_delete.append(record_id_to_delete)
+        embeddings = self.embedder.embed_chunks(documents)
+        for i, document in enumerate(documents):
+            document.embedding = embeddings[i]
         self.indexer.index(documents, ids_to_delete)
     def write(self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]) -> Iterable[AirbyteMessage]:

airbyte_cdk/sources/file_based/config/avro_format.py CHANGED Viewed

@@ -3,14 +3,16 @@
 #
 from pydantic import BaseModel, Field
-from typing_extensions import Literal
 class AvroFormat(BaseModel):
     class Config:
         title = "Avro Format"
-    filetype: Literal["avro"] = "avro"
+    filetype: str = Field(
+        "avro",
+        const=True,
+    )
     double_as_string: bool = Field(
         title="Convert Double Fields to Strings",

airbyte_cdk/sources/file_based/config/csv_format.py CHANGED Viewed

@@ -7,7 +7,6 @@ from enum import Enum
 from typing import Any, Dict, List, Optional, Set, Union
 from pydantic import BaseModel, Field, ValidationError, root_validator, validator
-from typing_extensions import Literal
 class InferenceType(Enum):
@@ -25,7 +24,10 @@ class CsvHeaderFromCsv(BaseModel):
     class Config:
         title = "From CSV"
-    header_definition_type: Literal[CsvHeaderDefinitionType.FROM_CSV.value] = CsvHeaderDefinitionType.FROM_CSV.value  # type: ignore
+    header_definition_type: str = Field(
+        CsvHeaderDefinitionType.FROM_CSV.value,
+        const=True,
+    )
     def has_header_row(self) -> bool:
         return True
@@ -35,7 +37,10 @@ class CsvHeaderAutogenerated(BaseModel):
     class Config:
         title = "Autogenerated"
-    header_definition_type: Literal[CsvHeaderDefinitionType.AUTOGENERATED.value] = CsvHeaderDefinitionType.AUTOGENERATED.value  # type: ignore
+    header_definition_type: str = Field(
+        CsvHeaderDefinitionType.AUTOGENERATED.value,
+        const=True,
+    )
     def has_header_row(self) -> bool:
         return False
@@ -45,7 +50,10 @@ class CsvHeaderUserProvided(BaseModel):
     class Config:
         title = "User Provided"
-    header_definition_type: Literal[CsvHeaderDefinitionType.USER_PROVIDED.value] = CsvHeaderDefinitionType.USER_PROVIDED.value  # type: ignore
+    header_definition_type: str = Field(
+        CsvHeaderDefinitionType.USER_PROVIDED.value,
+        const=True,
+    )
     column_names: List[str] = Field(
         title="Column Names",
         description="The column names that will be used while emitting the CSV records",
@@ -69,7 +77,10 @@ class CsvFormat(BaseModel):
     class Config:
         title = "CSV Format"
-    filetype: Literal["csv"] = "csv"
+    filetype: str = Field(
+        "csv",
+        const=True,
+    )
     delimiter: str = Field(
         title="Delimiter",
         description="The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.",

airbyte_cdk/sources/file_based/config/jsonl_format.py CHANGED Viewed

@@ -2,12 +2,14 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
-from pydantic import BaseModel
-from typing_extensions import Literal
+from pydantic import BaseModel, Field
 class JsonlFormat(BaseModel):
     class Config:
         title = "Jsonl Format"
-    filetype: Literal["jsonl"] = "jsonl"
+    filetype: str = Field(
+        "jsonl",
+        const=True,
+    )

airbyte_cdk/sources/file_based/config/parquet_format.py CHANGED Viewed

@@ -3,14 +3,16 @@
 #
 from pydantic import BaseModel, Field
-from typing_extensions import Literal
 class ParquetFormat(BaseModel):
     class Config:
         title = "Parquet Format"
-    filetype: Literal["parquet"] = "parquet"
+    filetype: str = Field(
+        "parquet",
+        const=True,
+    )
     # This option is not recommended, but necessary for backwards compatibility
     decimal_as_float: bool = Field(
         title="Convert Decimal Fields to Floats",

airbyte_cdk/sources/file_based/file_types/csv_parser.py CHANGED Viewed

@@ -11,6 +11,7 @@ from functools import partial
 from io import IOBase
 from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set
+from airbyte_cdk.models import FailureType
 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
@@ -18,6 +19,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFile
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING, SchemaType
+from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 DIALECT_NAME = "_config_dialect"
@@ -75,11 +77,12 @@ class _CsvReader:
         if isinstance(config_format.header_definition, CsvHeaderUserProvided):
             return config_format.header_definition.column_names  # type: ignore  # should be CsvHeaderUserProvided given the type
-        self._skip_rows(fp, config_format.skip_rows_before_header)
         if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
+            self._skip_rows(fp, config_format.skip_rows_before_header + config_format.skip_rows_after_header)
             headers = self._auto_generate_headers(fp, dialect_name)
         else:
             # Then read the header
+            self._skip_rows(fp, config_format.skip_rows_before_header)
             reader = csv.reader(fp, dialect=dialect_name)  # type: ignore
             headers = list(next(reader))
@@ -141,6 +144,12 @@ class CsvParser(FileTypeParser):
             if read_bytes >= self._MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE:
                 break
+        if not type_inferrer_by_field:
+            raise AirbyteTracedException(
+                message=f"Could not infer schema as there are no rows in {file.uri}. If having an empty CSV file is expected, ignore this. "
+                f"Else, please contact Airbyte.",
+                failure_type=FailureType.config_error,
+            )
         schema = {header.strip(): {"type": type_inferred.infer()} for header, type_inferred in type_inferrer_by_field.items()}
         data_generator.close()
         return schema

airbyte_cdk/sources/file_based/remote_file.py CHANGED Viewed

@@ -3,7 +3,6 @@
 #
 from datetime import datetime
-from typing import Optional
 from pydantic import BaseModel
@@ -15,11 +14,3 @@ class RemoteFile(BaseModel):
     uri: str
     last_modified: datetime
-    def extension_agrees_with_file_type(self, file_type: Optional[str]) -> bool:
-        extensions = self.uri.split(".")[1:]
-        if not extensions:
-            return True
-        if not file_type:
-            return True
-        return any(file_type.casefold() in e.casefold() for e in extensions)

{airbyte_cdk-0.51.10.dist-info → airbyte_cdk-0.51.12.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.51.10
+Version: 0.51.12
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte

{airbyte_cdk-0.51.10.dist-info → airbyte_cdk-0.51.12.dist-info}/RECORD RENAMED Viewed

@@ -14,13 +14,13 @@ airbyte_cdk/destinations/__init__.py,sha256=0Uxmz3iBAyZJdk_bqUVt2pb0UwRTpFjTnFE6
 airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQwQj8MyVm3BI,5420
 airbyte_cdk/destinations/vector_db_based/__init__.py,sha256=z5Pqxxt3v-JCcJQ6sK4tAz5sg1FB-3wTCd2p85MhFzc,711
 airbyte_cdk/destinations/vector_db_based/batcher.py,sha256=U2RI0CACZ1WhJIdkC5oPlwZ90OZB40kyFCR5I7StqZw,1160
-airbyte_cdk/destinations/vector_db_based/config.py,sha256=JrK2yuxWGaBa2PBP8WUX8WMG2ah6ah-Z75nWCeoGm_A,3407
-airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=D4L2NiMLv-Wy2FuYjluk-tuHljmSWMYGLHPNwoMi_0c,5716
-airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=mB-VOVB3-DCKEtKCNhcuLH9OFFcyuP3yNCUf37OtV1M,3640
-airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=WFSPGJsvCRSqdZkgeM2RCSD8VPgHOnJmKbXSNK5XZos,2451
-airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=Ldf-nJYCCzemP0bkjMQiRiApNh252cBs8PHKlFabG6o,1799
+airbyte_cdk/destinations/vector_db_based/config.py,sha256=xv5-IhPG_eKdRxstYmaFBUrYDECevE64OVRyUBZAJJw,4132
+airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=KHvCSjt6amwpIYxK42OuT1Vh-RCA5A3vEBfAmowXpZI,6161
+airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=0YLm5wmqiwCyUD_GWzqetWclzzKsADOfjXu0jMhQS1Y,6837
+airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=DMic7D7ie4gGQ-yOgGXGYjBsY8H7X5O5Tz_sCr0ajBU,2327
+airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=8d1Smk4jQRKtDfloXfEq12T-BU8ByyzzSBwAlchsU4A,1807
 airbyte_cdk/destinations/vector_db_based/utils.py,sha256=ngJ6hc9mmzgAEEBd9nuoRcPPFUKijv2CA6zZYUVRm54,240
-airbyte_cdk/destinations/vector_db_based/writer.py,sha256=uf3QLEKwaJsuGuhbfaJ4qbvr8EXc4NXr4J3JERP5dtI,2756
+airbyte_cdk/destinations/vector_db_based/writer.py,sha256=zSVizVPupTjdF_dwniIU0RYnTZ9TMkizOK48tDNPxxk,3110
 airbyte_cdk/models/__init__.py,sha256=rDARocDgxf4_qI66Bm6dHTBoecbWguTClGVBmOBiI2o,1674
 airbyte_cdk/models/airbyte_protocol.py,sha256=DoJvnmGM3xMAZFTwA6_RGMiKSFqfE3ib_Ru0KJ65Ag4,100
 airbyte_cdk/models/well_known_types.py,sha256=KKfNbow2gdLoC1Z4hcXy_JR8m_acsB2ol7gQuEgjobw,117
@@ -148,7 +148,7 @@ airbyte_cdk/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
 airbyte_cdk/sources/file_based/exceptions.py,sha256=4jwHysXT6r2o37Z7ch00nbo45wPVsmCorRYbYTmWd2Q,3656
 airbyte_cdk/sources/file_based/file_based_source.py,sha256=NCbXAGPWBQSPAf5x2U2eCdOLUd26RhO5s6K87_AF8Es,6931
 airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=K9fFHcSL4E8v-X2l38wRAcZCjpyifr35orvby8vQt84,3749
-airbyte_cdk/sources/file_based/remote_file.py,sha256=s3Qz2N786yqSMXqcWmsTOvYhgs-ry0xFcn5fGyyz7bY,581
+airbyte_cdk/sources/file_based/remote_file.py,sha256=xIDwDDBPhJI1K8YZuXjEfjxakZPMieBKJM6vmq6G5tw,248
 airbyte_cdk/sources/file_based/schema_helpers.py,sha256=XBkOutIw_n6SNYU34qbyTbl0Ppt0i4k3sVFMSaX3wJo,9103
 airbyte_cdk/sources/file_based/types.py,sha256=INxG7OPnkdUP69oYNKMAbwhvV1AGvLRHs1J6pIia2FI,218
 airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=WiPPuQGfmQlFUMFR5h3ECc-VzBj4vair6_4WAL87AEI,277
@@ -156,17 +156,17 @@ airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availab
 airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=InGBlGbInuNUQ6oaK5A9oICVc7ZNHMSYo8g5Vy2smOo,4266
 airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=WrV4sKtJoZ1dK31HK7NdBKlnYHkmu6NqjmEpkVqJ6tQ,4582
-airbyte_cdk/sources/file_based/config/avro_format.py,sha256=qGBB0RTjWDGZW-ilIwIq9OZl2BC-jBaq2WGrI3WVBsQ,597
-airbyte_cdk/sources/file_based/config/csv_format.py,sha256=-r-uGQlo-nXfhPuOR05XtYx_1vht74r8_am2_p8mcP8,7166
+airbyte_cdk/sources/file_based/config/avro_format.py,sha256=oLJIuNInu-MgjkVFqwHvmQ4CPZa4NZingq_I0_trQ3g,589
+airbyte_cdk/sources/file_based/config/csv_format.py,sha256=xlBZ5WyAshagjjjbUV_je1JyZ1oY1GbIzJRUZ9UfSvo,7095
 airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=5R0UlPJUGGx5OnpezZ0Fd8dyO4y2vMZtiPZR_3rfvSk,5916
-airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=B-s1uy9RiKpKMwmMlR7UT3WeQPlTI-xclD0fVM4IU1Q,254
-airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=zvcHATNKoBIgU2UXuGnoldqLoRXG_X8ZzAkpqGPJtq4,625
+airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=usmTeTw8xw8OKwrz8MsiS5E1LQiVEbedGHMHNAfOOlk,252
+airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=O_Eq0yVzjPiKDz8H1-f9yMowtCcJwT9F2prNYpXZkp0,614
 airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=x_7JsQGiS7Ytmr0ZDS0SNYGcNUzC4wCm3_1-Mf3ZFnw,283
 airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=cz9po5Cn6u50uq3hDy46pqnPR4JDcnRItZX9k0WDUJU,520
 airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=Mx3zT9Dem4uNfaUT0oOtrESsuB1LrGAi5N-uw2swZZA,701
 airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=N3a8cjXwRUN2__46IJTwrWlsyFiSA1xtSgPcPH28sn0,476
 airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=7PVaW17wn80HYW1mu074X2dy0UgFoqFqGIOKN2ZMKD0,8686
-airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=KgdpdkHAFducvXM2jQr356M0WVol-vX0cm42n9Kf_Yc,16684
+airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=_JQdzZMKkmcOPui7DyrF23twrT6wiXugXyKJEPhi-js,17252
 airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=cThTLc1YKSAapOn70lB09SzruRIPSShGIMz1f92QYV8,1555
 airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
 airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=QulQ_soGb1LpQ_KTxqWZjmfACGkTUDUOeuSmNFtcSLk,8717
@@ -328,13 +328,13 @@ unit_tests/sources/file_based/config/test_csv_format.py,sha256=VYL-9Ec8hW_yO2Pj9
 unit_tests/sources/file_based/config/test_file_based_stream_config.py,sha256=1eMsHlMQIFwyw20HjnhgKuiw6399sMcLTQ4LP09kTT4,3060
 unit_tests/sources/file_based/file_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=INqwKXcgNb3h_tktNXYU6WNUD-iNwRYHCd3IrnQa5R4,11051
-unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=4onvErJCMNeSquZr7c1dX4TzqJlvQ3wulYCjAU_IblU,21266
+unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=PalrxCRHAyoIp12IWWyePS9QF4LcvNVkqrKdwkrayJ4,22457
 unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
 unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=D7sKTty8aEqMDWWGKWUqDbWjTxhGkygU7ns4-_JceRY,13543
 unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=xUfw0crAvSTgQ2-chJx2ZiigQyo5IfrCuOFC1TWXXsQ,29795
 unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=xZf28rlH93ap8JCkAjwocng-uAW-mvMx6BDOLbvVCig,5588
-unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=nG4O2Ah0Uwgjg6SVTuioO_gPOigKxm-PlM2Tw21svYw,98724
+unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=_5FYtChp1B8D_6gHbmyDNm19Aa9rCk4JDm7u47p-W3M,98717
 unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=0maHng11cFmvzFLOniyBxOEYoKj4DYR3NO9-pSYoFLs,60710
 unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=N83fga4gMKkbm6hYnen1Z5p5eEgjnMB_M_sXx6B96cU,27503
 unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=-cBO1ZwberBxNMqDOtKz8yGwm3zB7elz_st2NKHeczM,26955
@@ -365,8 +365,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
 unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
 unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
 unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
-airbyte_cdk-0.51.10.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-0.51.10.dist-info/METADATA,sha256=ohErCgNizWXyUJffgIlkMxjXMFfVSP9ipuZh7D1ruCQ,9895
-airbyte_cdk-0.51.10.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-airbyte_cdk-0.51.10.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
-airbyte_cdk-0.51.10.dist-info/RECORD,,
+airbyte_cdk-0.51.12.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-0.51.12.dist-info/METADATA,sha256=kP39_c0A5hJ-e8yU-oZ-zAbknbhsFKaz7I11AoLyh5o,9895
+airbyte_cdk-0.51.12.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+airbyte_cdk-0.51.12.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
+airbyte_cdk-0.51.12.dist-info/RECORD,,

unit_tests/sources/file_based/file_types/test_csv_parser.py CHANGED Viewed

@@ -13,6 +13,7 @@ from unittest import TestCase, mock
 from unittest.mock import Mock
 import pytest
+from airbyte_cdk.models import FailureType
 from airbyte_cdk.sources.file_based.config.csv_format import (
     DEFAULT_FALSE_VALUES,
     DEFAULT_TRUE_VALUES,
@@ -26,6 +27,7 @@ from airbyte_cdk.sources.file_based.exceptions import RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser, _CsvReader
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
+from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 PROPERTY_TYPES = {
     "col1": "null",
@@ -169,7 +171,7 @@ class SchemaInferenceTestCase(TestCase):
         self._config.get_input_schema.return_value = None
         self._config.format = self._config_format
-        self._file = Mock(spec=RemoteFile)
+        self._file = RemoteFile(uri="a uri", last_modified=datetime.now())
         self._stream_reader = Mock(spec=AbstractFileBasedStreamReader)
         self._logger = Mock(spec=logging.Logger)
         self._csv_reader = Mock(spec=_CsvReader)
@@ -222,6 +224,12 @@ class SchemaInferenceTestCase(TestCase):
         # since the type is number, we know the string at the end was not considered
         assert inferred_schema == {self._HEADER_NAME: {"type": "number"}}
+    def test_given_empty_csv_file_when_infer_schema_then_raise_config_error(self) -> None:
+        self._csv_reader.read_data.return_value = []
+        with pytest.raises(AirbyteTracedException) as exception:
+            self._infer_schema()
+        assert exception.value.failure_type == FailureType.config_error
     def _test_infer_schema(self, rows: List[str], expected_type: str) -> None:
         self._csv_reader.read_data.return_value = ({self._HEADER_NAME: row} for row in rows)
         inferred_schema = self._infer_schema()
@@ -260,7 +268,7 @@ class CsvReaderTest(unittest.TestCase):
         self._config.name = self._CONFIG_NAME
         self._config.format = self._config_format
-        self._file = Mock(spec=RemoteFile)
+        self._file = RemoteFile(uri="a uri", last_modified=datetime.now())
         self._stream_reader = Mock(spec=AbstractFileBasedStreamReader)
         self._logger = Mock(spec=logging.Logger)
         self._csv_reader = _CsvReader()
@@ -292,6 +300,21 @@ class CsvReaderTest(unittest.TestCase):
         assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
+    def test_given_skip_row_before_and_after_and_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None:
+        self._config_format.header_definition = CsvHeaderAutogenerated()
+        self._config_format.skip_rows_before_header = 1
+        self._config_format.skip_rows_after_header = 2
+        self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([
+            "skip before",
+            "skip after 1",
+            "skip after 2",
+            "0,1,2,3,4,5,6"
+        ]).build()
+        data_generator = self._read_data()
+        assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
     def test_given_user_provided_headers_when_read_data_then_use_user_provided_headers(self) -> None:
         self._config_format.header_definition = CsvHeaderUserProvided(column_names=["first", "second", "third", "fourth"])
         self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()

unit_tests/sources/file_based/scenarios/csv_scenarios.py CHANGED Viewed

@@ -111,7 +111,7 @@ single_csv_scenario = (
                                             "title": "Avro Format",
                                             "type": "object",
                                             "properties": {
-                                                "filetype": {"title": "Filetype", "default": "avro", "enum": ["avro"], "type": "string"},
+                                                "filetype": {"title": "Filetype", "default": "avro", "const": "avro", "type": "string"},
                                                 "double_as_string": {
                                                     "title": "Convert Double Fields to Strings",
                                                     "description": "Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss precision when handling floating point numbers.",
@@ -124,7 +124,7 @@ single_csv_scenario = (
                                             "title": "CSV Format",
                                             "type": "object",
                                             "properties": {
-                                                "filetype": {"title": "Filetype", "default": "csv", "enum": ["csv"], "type": "string"},
+                                                "filetype": {"title": "Filetype", "default": "csv", "const": "csv", "type": "string"},
                                                 "delimiter": {
                                                     "title": "Delimiter",
                                                     "description": "The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.",
@@ -190,21 +190,21 @@ single_csv_scenario = (
                                                             "title": "From CSV",
                                                             "type": "object",
                                                             "properties": {
-                                                                "header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "enum": ["From CSV"], "type": "string"},
+                                                                "header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "const": "From CSV", "type": "string"},
                                                             },
                                                         },
                                                         {
                                                             "title": "Autogenerated",
                                                             "type": "object",
                                                             "properties": {
-                                                                "header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "enum": ["Autogenerated"], "type": "string"},
+                                                                "header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "const": "Autogenerated", "type": "string"},
                                                             },
                                                         },
                                                         {
                                                             "title": "User Provided",
                                                             "type": "object",
                                                             "properties": {
-                                                                "header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "enum": ["User Provided"], "type": "string"},
+                                                                "header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "const": "User Provided", "type": "string"},
                                                                 "column_names": {
                                                                     "title": "Column Names",
                                                                     "description": "The column names that will be used while emitting the CSV records",
@@ -247,7 +247,7 @@ single_csv_scenario = (
                                             "title": "Jsonl Format",
                                             "type": "object",
                                             "properties": {
-                                                "filetype": {"title": "Filetype", "default": "jsonl", "enum": ["jsonl"], "type": "string"}
+                                                "filetype": {"title": "Filetype", "default": "jsonl", "const": "jsonl", "type": "string"}
                                             },
                                         },
                                         {
@@ -257,7 +257,7 @@ single_csv_scenario = (
                                                 "filetype": {
                                                     "title": "Filetype",
                                                     "default": "parquet",
-                                                    "enum": ["parquet"],
+                                                    "const": "parquet",
                                                     "type": "string",
                                                 },
                                                 "decimal_as_float": {

{airbyte_cdk-0.51.10.dist-info → airbyte_cdk-0.51.12.dist-info}/LICENSE.txt RENAMED Viewed

File without changes

{airbyte_cdk-0.51.10.dist-info → airbyte_cdk-0.51.12.dist-info}/WHEEL RENAMED Viewed

File without changes

{airbyte_cdk-0.51.10.dist-info → airbyte_cdk-0.51.12.dist-info}/top_level.txt RENAMED Viewed

File without changes

airbyte-cdk 0.51.10__py3-none-any.whl → 0.51.12__py3-none-any.whl

airbyte-cdk 0.51.10 py3-none-any.whl → 0.51.12 py3-none-any.whl