airbyte-cdk 0.51.10__py3-none-any.whl → 0.51.12__py3-none-any.whl

airbyte_cdk/destinations/vector_db_based/config.py

@@ -59,6 +59,22 @@ class FakeEmbeddingConfigModel(BaseModel):
         }
 
 
+class FromFieldEmbeddingConfigModel(BaseModel):
+    mode: Literal["from_field"] = Field("from_field", const=True)
+    field_name: str = Field(
+        ..., title="Field name", description="Name of the field in the record that contains the embedding", examples=["embedding", "vector"]
+    )
+    dimensions: int = Field(
+        ..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
+    )
+
+    class Config:
+        title = "From Field"
+        schema_extra = {
+            "description": "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
+        }
+
+
 class CohereEmbeddingConfigModel(BaseModel):
     mode: Literal["cohere"] = Field("cohere", const=True)
     cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True)
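
The new "from_field" mode lets a destination reuse vectors that are already present on incoming records instead of calling an embedding service. A minimal sketch of parsing such a config with the model above (the payload values are illustrative, not part of the diff):

from airbyte_cdk.destinations.vector_db_based.config import FromFieldEmbeddingConfigModel

# Illustrative connector config selecting the new embedding mode.
raw = {
    "mode": "from_field",
    "field_name": "embedding",  # record field holding the precomputed vector
    "dimensions": 1536,         # expected length of each vector
}
config = FromFieldEmbeddingConfigModel.parse_obj(raw)
print(config.field_name, config.dimensions)
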
airbyte_cdk/destinations/vector_db_based/document_processor.py

@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
 import dpath.util
 from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
 from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode
+from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
 from langchain.document_loaders.base import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.utils import stringify_dict
@@ -21,8 +22,8 @@ METADATA_RECORD_ID_FIELD = "_ab_record_id"
 class Chunk:
     page_content: str
     metadata: Dict[str, Any]
-    stream: str
-    namespace: Optional[str] = None
+    record: AirbyteRecordMessage
+    embedding: Optional[List[float]] = None
 
 
 class DocumentProcessor:
@@ -66,11 +67,14 @@ class DocumentProcessor:
         """
         doc = self._generate_document(record)
         if doc is None:
-            raise ValueError(f"Record {str(record.data)[:250]}... does not contain any text fields.")
-        chunks = [
-            Chunk(
-                page_content=chunk_document.page_content, metadata=chunk_document.metadata, stream=record.stream, namespace=record.namespace
+            text_fields = ", ".join(self.text_fields) if self.text_fields else "all fields"
+            raise AirbyteTracedException(
+                internal_message="No text fields found in record",
+                message=f"Record {str(record.data)[:250]}... does not contain any of the configured text fields: {text_fields}. Please check your processing configuration, there has to be at least one text field set in each record.",
+                failure_type=FailureType.config_error,
             )
+        chunks = [
+            Chunk(page_content=chunk_document.page_content, metadata=chunk_document.metadata, record=record)
             for chunk_document in self._split_document(doc)
         ]
         id_to_delete = doc.metadata[METADATA_RECORD_ID_FIELD] if METADATA_RECORD_ID_FIELD in doc.metadata else None
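
Two behavior changes here: a record without any of the configured text fields now surfaces as a config-typed AirbyteTracedException instead of a bare ValueError, and each Chunk keeps a reference to its source record plus a slot for the embedding that the writer fills in later. A rough sketch of the new Chunk shape, with made-up record contents:

from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
from airbyte_cdk.models import AirbyteRecordMessage

record = AirbyteRecordMessage(stream="products", data={"text": "A book"}, emitted_at=0)

# The chunk now carries the whole record, so embedders can read record.data;
# embedding stays None until the writer populates it.
chunk = Chunk(page_content="A book", metadata={}, record=record)
assert chunk.embedding is None
assert chunk.record.stream == "products"
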
airbyte_cdk/destinations/vector_db_based/embedder.py

@@ -5,8 +5,15 @@
 from abc import ABC, abstractmethod
 from typing import List, Optional
 
-from airbyte_cdk.destinations.vector_db_based.config import CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, OpenAIEmbeddingConfigModel
+from airbyte_cdk.destinations.vector_db_based.config import (
+    CohereEmbeddingConfigModel,
+    FakeEmbeddingConfigModel,
+    FromFieldEmbeddingConfigModel,
+    OpenAIEmbeddingConfigModel,
+)
+from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
 from airbyte_cdk.destinations.vector_db_based.utils import format_exception
+from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
 from langchain.embeddings.cohere import CohereEmbeddings
 from langchain.embeddings.fake import FakeEmbeddings
 from langchain.embeddings.openai import OpenAIEmbeddings
@@ -17,7 +24,7 @@ class Embedder(ABC):
     Embedder is an abstract class that defines the interface for embedding text.
 
     The Indexer class uses the Embedder class to internally embed text - each indexer is responsible to pass the text of all documents to the embedder and store the resulting embeddings in the destination.
-    The destination connector is responsible to create an embedder instance and pass it to the indexer.
+    The destination connector is responsible to create an embedder instance and pass it to the writer.
     The CDK defines basic embedders that should be supported in each destination. It is possible to implement custom embedders for special destinations if needed.
     """
 
@@ -29,7 +36,11 @@ class Embedder(ABC):
         pass
 
     @abstractmethod
-    def embed_texts(self, texts: List[str]) -> List[List[float]]:
+    def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
+        """
+        Embed the text of each chunk and return the resulting embedding vectors.
+        If a chunk cannot be embedded or is configured to not be embedded, return None for that chunk.
+        """
        pass
 
     @property
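
The interface change means custom embedders now receive whole chunks rather than bare strings, and may return None per chunk. A toy subclass just to illustrate the new contract (the constant vector is obviously not a real embedding):

from typing import List, Optional

from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
from airbyte_cdk.destinations.vector_db_based.embedder import Embedder


class ConstantEmbedder(Embedder):
    def check(self) -> Optional[str]:
        return None  # nothing to validate

    def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
        # A real embedder would batch chunk.page_content into a model call.
        return [[0.0] * self.embedding_dimensions for _ in chunks]

    @property
    def embedding_dimensions(self) -> int:
        return 3
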
@@ -54,8 +65,8 @@ class OpenAIEmbedder(Embedder):
             return format_exception(e)
         return None
 
-    def embed_texts(self, texts: List[str]) -> List[List[float]]:
-        return self.embeddings.embed_documents(texts)
+    def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
+        return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
 
     @property
     def embedding_dimensions(self) -> int:
@@ -79,8 +90,8 @@ class CohereEmbedder(Embedder):
             return format_exception(e)
         return None
 
-    def embed_texts(self, texts: List[str]) -> List[List[float]]:
-        return self.embeddings.embed_documents(texts)
+    def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
+        return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
 
     @property
     def embedding_dimensions(self) -> int:
@@ -100,10 +111,54 @@ class FakeEmbedder(Embedder):
             return format_exception(e)
         return None
 
-    def embed_texts(self, texts: List[str]) -> List[List[float]]:
-        return self.embeddings.embed_documents(texts)
+    def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
+        return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
 
     @property
     def embedding_dimensions(self) -> int:
         # use same vector size as for OpenAI embeddings to keep it realistic
         return OPEN_AI_VECTOR_SIZE
+
+
+class FromFieldEmbedder(Embedder):
+    def __init__(self, config: FromFieldEmbeddingConfigModel):
+        super().__init__()
+        self.config = config
+
+    def check(self) -> Optional[str]:
+        return None
+
+    def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
+        """
+        From each chunk, pull the embedding from the field specified in the config.
+        Check that the field exists, is a list of numbers and is the correct size. If not, raise an AirbyteTracedException explaining the problem.
+        """
+        embeddings = []
+        for chunk in chunks:
+            data = chunk.record.data
+            if self.config.field_name not in data:
+                raise AirbyteTracedException(
+                    internal_message="Embedding vector field not found",
+                    failure_type=FailureType.config_error,
+                    message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.",
+                )
+            field = data[self.config.field_name]
+            if not isinstance(field, list) or not all(isinstance(x, (int, float)) for x in field):
+                raise AirbyteTracedException(
+                    internal_message="Embedding vector field not a list of numbers",
+                    failure_type=FailureType.config_error,
+                    message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
+                )
+            if len(field) != self.config.dimensions:
+                raise AirbyteTracedException(
+                    internal_message="Embedding vector field has wrong length",
+                    failure_type=FailureType.config_error,
+                    message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
+                )
+            embeddings.append(field)
+
+        return embeddings
+
+    @property
+    def embedding_dimensions(self) -> int:
+        return self.config.dimensions
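
Roughly how the new FromFieldEmbedder behaves on a well-formed record (stream name and data below are made up for the sketch):

from airbyte_cdk.destinations.vector_db_based.config import FromFieldEmbeddingConfigModel
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
from airbyte_cdk.destinations.vector_db_based.embedder import FromFieldEmbedder
from airbyte_cdk.models import AirbyteRecordMessage

embedder = FromFieldEmbedder(FromFieldEmbeddingConfigModel(mode="from_field", field_name="vector", dimensions=3))
record = AirbyteRecordMessage(stream="docs", data={"text": "hello", "vector": [0.1, 0.2, 0.3]}, emitted_at=0)
chunk = Chunk(page_content="hello", metadata={}, record=record)

print(embedder.embed_chunks([chunk]))  # [[0.1, 0.2, 0.3]]
# A missing field, non-numeric entries, or a wrong length instead raise
# AirbyteTracedException with failure_type=FailureType.config_error.
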
airbyte_cdk/destinations/vector_db_based/indexer.py

@@ -7,7 +7,6 @@ from abc import ABC, abstractmethod
 from typing import Any, Generator, Iterable, List, Optional, Tuple, TypeVar
 
 from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
-from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
 from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog
 
 
@@ -19,9 +18,8 @@ class Indexer(ABC):
     In a destination connector, implement a custom indexer by extending this class and implementing the abstract methods.
     """
 
-    def __init__(self, config: Any, embedder: Embedder):
+    def __init__(self, config: Any):
         self.config = config
-        self.embedder = embedder
         pass
 
     def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None:
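
With embedding moved out, a custom indexer now only takes its config; the embedder no longer flows through it. A sketch of an updated subclass (other abstract methods are omitted for brevity; the index signature mirrors the call the writer makes, which is an assumption about this CDK version):

from typing import Any, List

from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
from airbyte_cdk.destinations.vector_db_based.indexer import Indexer


class InMemoryIndexer(Indexer):
    def __init__(self, config: Any):
        super().__init__(config)  # note: no embedder argument anymore
        self.chunks: List[Chunk] = []

    def index(self, document_chunks: List[Chunk], delete_ids: List[str]) -> None:
        # chunk.embedding is already populated by the writer at this point
        self.chunks.extend(document_chunks)
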
airbyte_cdk/destinations/vector_db_based/test_utils.py

@@ -48,6 +48,6 @@ class BaseIntegrationTest(unittest.TestCase):
             type=Type.RECORD, record=AirbyteRecordMessage(stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0)
         )
 
-    def setUp(self):
+    def setUp(self) -> None:
         with open("secrets/config.json", "r") as f:
             self.config = json.loads(f.read())
airbyte_cdk/destinations/vector_db_based/writer.py

@@ -8,24 +8,27 @@ from typing import Iterable, List
 from airbyte_cdk.destinations.vector_db_based.batcher import Batcher
 from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
 from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk, DocumentProcessor
+from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
 from airbyte_cdk.destinations.vector_db_based.indexer import Indexer
 from airbyte_cdk.models import AirbyteMessage, AirbyteRecordMessage, ConfiguredAirbyteCatalog, Type
 
 
 class Writer:
     """
-    The Writer class is orchestrating the document processor, the batcher and the indexer:
+    The Writer class is orchestrating the document processor, the batcher, the embedder and the indexer:
     * Incoming records are collected using the batcher
     * The document processor generates documents from all records in the batch
-    * The indexer indexes the resulting documents in the destination
+    * The embedder embeds the documents
+    * The indexer indexes the resulting documents and their embeddings in the destination
 
     The destination connector is responsible to create a writer instance and pass the input messages iterable to the write method.
     The batch size can be configured by the destination connector to give the freedom of either letting the user configure it or hardcoding it to a sensible value depending on the destination.
     """
 
-    def __init__(self, processing_config: ProcessingConfigModel, indexer: Indexer, batch_size: int) -> None:
+    def __init__(self, processing_config: ProcessingConfigModel, indexer: Indexer, embedder: Embedder, batch_size: int) -> None:
         self.processing_config = processing_config
         self.indexer = indexer
+        self.embedder = embedder
         self.batcher = Batcher(batch_size, lambda batch: self._process_batch(batch))
 
     def _process_batch(self, batch: List[AirbyteRecordMessage]) -> None:
@@ -36,6 +39,9 @@ class Writer:
             documents.extend(record_documents)
             if record_id_to_delete is not None:
                 ids_to_delete.append(record_id_to_delete)
+        embeddings = self.embedder.embed_chunks(documents)
+        for i, document in enumerate(documents):
+            document.embedding = embeddings[i]
         self.indexer.index(documents, ids_to_delete)
 
     def write(self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]) -> Iterable[AirbyteMessage]:
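
Putting the pieces together: a destination now constructs the embedder itself and hands it to the writer alongside the indexer. A wiring sketch, assuming the InMemoryIndexer sketch from above has the remaining abstract methods implemented and that ProcessingConfigModel takes these field names in this CDK version:

from airbyte_cdk.destinations.vector_db_based.config import FakeEmbeddingConfigModel, ProcessingConfigModel
from airbyte_cdk.destinations.vector_db_based.embedder import FakeEmbedder
from airbyte_cdk.destinations.vector_db_based.writer import Writer

processing = ProcessingConfigModel(chunk_size=1000, chunk_overlap=0, text_fields=["text"])
embedder = FakeEmbedder(FakeEmbeddingConfigModel(mode="fake"))
indexer = InMemoryIndexer(config={})  # hypothetical Indexer subclass from the sketch above

# The writer now embeds each batch itself before handing chunks to the indexer.
writer = Writer(processing, indexer, embedder, batch_size=128)
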
airbyte_cdk/sources/file_based/config/avro_format.py

@@ -3,14 +3,16 @@
 #
 
 from pydantic import BaseModel, Field
-from typing_extensions import Literal
 
 
 class AvroFormat(BaseModel):
     class Config:
         title = "Avro Format"
 
-    filetype: Literal["avro"] = "avro"
+    filetype: str = Field(
+        "avro",
+        const=True,
+    )
 
     double_as_string: bool = Field(
         title="Convert Double Fields to Strings",
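
The same Literal-to-Field(const=True) swap repeats across all the format models below. The point of it: with the pinned pydantic v1, a const-constrained default renders as a JSON-schema "const" keyword instead of a one-value "enum", which is exactly what the updated spec expectations in csv_scenarios.py (further down in this diff) assert. A standalone sketch of the effect:

from pydantic import BaseModel, Field


class AvroFormatSketch(BaseModel):
    filetype: str = Field("avro", const=True)


print(AvroFormatSketch.schema()["properties"]["filetype"])
# {'title': 'Filetype', 'default': 'avro', 'const': 'avro', 'type': 'string'}
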
airbyte_cdk/sources/file_based/config/csv_format.py

@@ -7,7 +7,6 @@ from enum import Enum
 from typing import Any, Dict, List, Optional, Set, Union
 
 from pydantic import BaseModel, Field, ValidationError, root_validator, validator
-from typing_extensions import Literal
 
 
 class InferenceType(Enum):
@@ -25,7 +24,10 @@ class CsvHeaderFromCsv(BaseModel):
     class Config:
         title = "From CSV"
 
-    header_definition_type: Literal[CsvHeaderDefinitionType.FROM_CSV.value] = CsvHeaderDefinitionType.FROM_CSV.value  # type: ignore
+    header_definition_type: str = Field(
+        CsvHeaderDefinitionType.FROM_CSV.value,
+        const=True,
+    )
 
     def has_header_row(self) -> bool:
         return True
@@ -35,7 +37,10 @@ class CsvHeaderAutogenerated(BaseModel):
     class Config:
         title = "Autogenerated"
 
-    header_definition_type: Literal[CsvHeaderDefinitionType.AUTOGENERATED.value] = CsvHeaderDefinitionType.AUTOGENERATED.value  # type: ignore
+    header_definition_type: str = Field(
+        CsvHeaderDefinitionType.AUTOGENERATED.value,
+        const=True,
+    )
 
     def has_header_row(self) -> bool:
         return False
@@ -45,7 +50,10 @@ class CsvHeaderUserProvided(BaseModel):
     class Config:
         title = "User Provided"
 
-    header_definition_type: Literal[CsvHeaderDefinitionType.USER_PROVIDED.value] = CsvHeaderDefinitionType.USER_PROVIDED.value  # type: ignore
+    header_definition_type: str = Field(
+        CsvHeaderDefinitionType.USER_PROVIDED.value,
+        const=True,
+    )
     column_names: List[str] = Field(
         title="Column Names",
         description="The column names that will be used while emitting the CSV records",
@@ -69,7 +77,10 @@ class CsvFormat(BaseModel):
     class Config:
         title = "CSV Format"
 
-    filetype: Literal["csv"] = "csv"
+    filetype: str = Field(
+        "csv",
+        const=True,
+    )
     delimiter: str = Field(
         title="Delimiter",
         description="The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.",
airbyte_cdk/sources/file_based/config/jsonl_format.py

@@ -2,12 +2,14 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
-from pydantic import BaseModel
-from typing_extensions import Literal
+from pydantic import BaseModel, Field
 
 
 class JsonlFormat(BaseModel):
     class Config:
         title = "Jsonl Format"
 
-    filetype: Literal["jsonl"] = "jsonl"
+    filetype: str = Field(
+        "jsonl",
+        const=True,
+    )
airbyte_cdk/sources/file_based/config/parquet_format.py

@@ -3,14 +3,16 @@
 #
 
 from pydantic import BaseModel, Field
-from typing_extensions import Literal
 
 
 class ParquetFormat(BaseModel):
     class Config:
         title = "Parquet Format"
 
-    filetype: Literal["parquet"] = "parquet"
+    filetype: str = Field(
+        "parquet",
+        const=True,
+    )
 
     # This option is not recommended, but necessary for backwards compatibility
     decimal_as_float: bool = Field(
         title="Convert Decimal Fields to Floats",
airbyte_cdk/sources/file_based/file_types/csv_parser.py

@@ -11,6 +11,7 @@ from functools import partial
 from io import IOBase
 from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set
 
+from airbyte_cdk.models import FailureType
 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
@@ -18,6 +19,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFile
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING, SchemaType
+from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 
 DIALECT_NAME = "_config_dialect"
 
@@ -75,11 +77,12 @@ class _CsvReader:
         if isinstance(config_format.header_definition, CsvHeaderUserProvided):
             return config_format.header_definition.column_names  # type: ignore  # should be CsvHeaderUserProvided given the type
 
-        self._skip_rows(fp, config_format.skip_rows_before_header)
         if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
+            self._skip_rows(fp, config_format.skip_rows_before_header + config_format.skip_rows_after_header)
             headers = self._auto_generate_headers(fp, dialect_name)
         else:
             # Then read the header
+            self._skip_rows(fp, config_format.skip_rows_before_header)
             reader = csv.reader(fp, dialect=dialect_name)  # type: ignore
             headers = list(next(reader))
 
@@ -141,6 +144,12 @@ class CsvParser(FileTypeParser):
             if read_bytes >= self._MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE:
                 break
 
+        if not type_inferrer_by_field:
+            raise AirbyteTracedException(
+                message=f"Could not infer schema as there are no rows in {file.uri}. If having an empty CSV file is expected, ignore this. "
+                f"Else, please contact Airbyte.",
+                failure_type=FailureType.config_error,
+            )
         schema = {header.strip(): {"type": type_inferred.infer()} for header, type_inferred in type_inferrer_by_field.items()}
         data_generator.close()
         return schema
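
The header-skipping fix matters because, previously, skip_rows_before_header was applied unconditionally while skip_rows_after_header was never applied in the autogenerated case, so "after header" rows leaked into the data of files that have no header row. A small sketch of the fixed semantics (mirrors the new unit test near the end of this diff; the helper name is made up):

from typing import Iterator, List


def data_rows_with_autogenerated_headers(lines: List[str], before: int, after: int) -> Iterator[str]:
    # With autogenerated headers there is no header row in the file,
    # so both skip counts are consumed up front, before the first data row.
    it = iter(lines)
    for _ in range(before + after):
        next(it, None)
    return it


rows = data_rows_with_autogenerated_headers(
    ["skip before", "skip after 1", "skip after 2", "0,1,2,3"], before=1, after=2
)
print(list(rows))  # ['0,1,2,3']
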
airbyte_cdk/sources/file_based/remote_file.py

@@ -3,7 +3,6 @@
 #
 
 from datetime import datetime
-from typing import Optional
 
 from pydantic import BaseModel
 
@@ -15,11 +14,3 @@ class RemoteFile(BaseModel):
 
     uri: str
     last_modified: datetime
-
-    def extension_agrees_with_file_type(self, file_type: Optional[str]) -> bool:
-        extensions = self.uri.split(".")[1:]
-        if not extensions:
-            return True
-        if not file_type:
-            return True
-        return any(file_type.casefold() in e.casefold() for e in extensions)
airbyte_cdk-0.51.12.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.51.10
+Version: 0.51.12
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte
airbyte_cdk-0.51.12.dist-info/RECORD

@@ -14,13 +14,13 @@ airbyte_cdk/destinations/__init__.py,sha256=0Uxmz3iBAyZJdk_bqUVt2pb0UwRTpFjTnFE6
 airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQwQj8MyVm3BI,5420
 airbyte_cdk/destinations/vector_db_based/__init__.py,sha256=z5Pqxxt3v-JCcJQ6sK4tAz5sg1FB-3wTCd2p85MhFzc,711
 airbyte_cdk/destinations/vector_db_based/batcher.py,sha256=U2RI0CACZ1WhJIdkC5oPlwZ90OZB40kyFCR5I7StqZw,1160
-airbyte_cdk/destinations/vector_db_based/config.py,sha256=JrK2yuxWGaBa2PBP8WUX8WMG2ah6ah-Z75nWCeoGm_A,3407
-airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=D4L2NiMLv-Wy2FuYjluk-tuHljmSWMYGLHPNwoMi_0c,5716
-airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=mB-VOVB3-DCKEtKCNhcuLH9OFFcyuP3yNCUf37OtV1M,3640
-airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=WFSPGJsvCRSqdZkgeM2RCSD8VPgHOnJmKbXSNK5XZos,2451
-airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=Ldf-nJYCCzemP0bkjMQiRiApNh252cBs8PHKlFabG6o,1799
+airbyte_cdk/destinations/vector_db_based/config.py,sha256=xv5-IhPG_eKdRxstYmaFBUrYDECevE64OVRyUBZAJJw,4132
+airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=KHvCSjt6amwpIYxK42OuT1Vh-RCA5A3vEBfAmowXpZI,6161
+airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=0YLm5wmqiwCyUD_GWzqetWclzzKsADOfjXu0jMhQS1Y,6837
+airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=DMic7D7ie4gGQ-yOgGXGYjBsY8H7X5O5Tz_sCr0ajBU,2327
+airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=8d1Smk4jQRKtDfloXfEq12T-BU8ByyzzSBwAlchsU4A,1807
 airbyte_cdk/destinations/vector_db_based/utils.py,sha256=ngJ6hc9mmzgAEEBd9nuoRcPPFUKijv2CA6zZYUVRm54,240
-airbyte_cdk/destinations/vector_db_based/writer.py,sha256=uf3QLEKwaJsuGuhbfaJ4qbvr8EXc4NXr4J3JERP5dtI,2756
+airbyte_cdk/destinations/vector_db_based/writer.py,sha256=zSVizVPupTjdF_dwniIU0RYnTZ9TMkizOK48tDNPxxk,3110
 airbyte_cdk/models/__init__.py,sha256=rDARocDgxf4_qI66Bm6dHTBoecbWguTClGVBmOBiI2o,1674
 airbyte_cdk/models/airbyte_protocol.py,sha256=DoJvnmGM3xMAZFTwA6_RGMiKSFqfE3ib_Ru0KJ65Ag4,100
 airbyte_cdk/models/well_known_types.py,sha256=KKfNbow2gdLoC1Z4hcXy_JR8m_acsB2ol7gQuEgjobw,117
@@ -148,7 +148,7 @@ airbyte_cdk/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
 airbyte_cdk/sources/file_based/exceptions.py,sha256=4jwHysXT6r2o37Z7ch00nbo45wPVsmCorRYbYTmWd2Q,3656
 airbyte_cdk/sources/file_based/file_based_source.py,sha256=NCbXAGPWBQSPAf5x2U2eCdOLUd26RhO5s6K87_AF8Es,6931
 airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=K9fFHcSL4E8v-X2l38wRAcZCjpyifr35orvby8vQt84,3749
-airbyte_cdk/sources/file_based/remote_file.py,sha256=s3Qz2N786yqSMXqcWmsTOvYhgs-ry0xFcn5fGyyz7bY,581
+airbyte_cdk/sources/file_based/remote_file.py,sha256=xIDwDDBPhJI1K8YZuXjEfjxakZPMieBKJM6vmq6G5tw,248
 airbyte_cdk/sources/file_based/schema_helpers.py,sha256=XBkOutIw_n6SNYU34qbyTbl0Ppt0i4k3sVFMSaX3wJo,9103
 airbyte_cdk/sources/file_based/types.py,sha256=INxG7OPnkdUP69oYNKMAbwhvV1AGvLRHs1J6pIia2FI,218
 airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=WiPPuQGfmQlFUMFR5h3ECc-VzBj4vair6_4WAL87AEI,277
@@ -156,17 +156,17 @@ airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availab
 airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=InGBlGbInuNUQ6oaK5A9oICVc7ZNHMSYo8g5Vy2smOo,4266
 airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=WrV4sKtJoZ1dK31HK7NdBKlnYHkmu6NqjmEpkVqJ6tQ,4582
-airbyte_cdk/sources/file_based/config/avro_format.py,sha256=qGBB0RTjWDGZW-ilIwIq9OZl2BC-jBaq2WGrI3WVBsQ,597
-airbyte_cdk/sources/file_based/config/csv_format.py,sha256=-r-uGQlo-nXfhPuOR05XtYx_1vht74r8_am2_p8mcP8,7166
+airbyte_cdk/sources/file_based/config/avro_format.py,sha256=oLJIuNInu-MgjkVFqwHvmQ4CPZa4NZingq_I0_trQ3g,589
+airbyte_cdk/sources/file_based/config/csv_format.py,sha256=xlBZ5WyAshagjjjbUV_je1JyZ1oY1GbIzJRUZ9UfSvo,7095
 airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=5R0UlPJUGGx5OnpezZ0Fd8dyO4y2vMZtiPZR_3rfvSk,5916
-airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=B-s1uy9RiKpKMwmMlR7UT3WeQPlTI-xclD0fVM4IU1Q,254
-airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=zvcHATNKoBIgU2UXuGnoldqLoRXG_X8ZzAkpqGPJtq4,625
+airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=usmTeTw8xw8OKwrz8MsiS5E1LQiVEbedGHMHNAfOOlk,252
+airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=O_Eq0yVzjPiKDz8H1-f9yMowtCcJwT9F2prNYpXZkp0,614
 airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=x_7JsQGiS7Ytmr0ZDS0SNYGcNUzC4wCm3_1-Mf3ZFnw,283
 airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=cz9po5Cn6u50uq3hDy46pqnPR4JDcnRItZX9k0WDUJU,520
 airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=Mx3zT9Dem4uNfaUT0oOtrESsuB1LrGAi5N-uw2swZZA,701
 airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=N3a8cjXwRUN2__46IJTwrWlsyFiSA1xtSgPcPH28sn0,476
 airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=7PVaW17wn80HYW1mu074X2dy0UgFoqFqGIOKN2ZMKD0,8686
-airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=KgdpdkHAFducvXM2jQr356M0WVol-vX0cm42n9Kf_Yc,16684
+airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=_JQdzZMKkmcOPui7DyrF23twrT6wiXugXyKJEPhi-js,17252
 airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=cThTLc1YKSAapOn70lB09SzruRIPSShGIMz1f92QYV8,1555
 airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
 airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=QulQ_soGb1LpQ_KTxqWZjmfACGkTUDUOeuSmNFtcSLk,8717
@@ -328,13 +328,13 @@ unit_tests/sources/file_based/config/test_csv_format.py,sha256=VYL-9Ec8hW_yO2Pj9
 unit_tests/sources/file_based/config/test_file_based_stream_config.py,sha256=1eMsHlMQIFwyw20HjnhgKuiw6399sMcLTQ4LP09kTT4,3060
 unit_tests/sources/file_based/file_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=INqwKXcgNb3h_tktNXYU6WNUD-iNwRYHCd3IrnQa5R4,11051
-unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=4onvErJCMNeSquZr7c1dX4TzqJlvQ3wulYCjAU_IblU,21266
+unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=PalrxCRHAyoIp12IWWyePS9QF4LcvNVkqrKdwkrayJ4,22457
 unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
 unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=D7sKTty8aEqMDWWGKWUqDbWjTxhGkygU7ns4-_JceRY,13543
 unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=xUfw0crAvSTgQ2-chJx2ZiigQyo5IfrCuOFC1TWXXsQ,29795
 unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=xZf28rlH93ap8JCkAjwocng-uAW-mvMx6BDOLbvVCig,5588
-unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=nG4O2Ah0Uwgjg6SVTuioO_gPOigKxm-PlM2Tw21svYw,98724
+unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=_5FYtChp1B8D_6gHbmyDNm19Aa9rCk4JDm7u47p-W3M,98717
 unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=0maHng11cFmvzFLOniyBxOEYoKj4DYR3NO9-pSYoFLs,60710
 unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=N83fga4gMKkbm6hYnen1Z5p5eEgjnMB_M_sXx6B96cU,27503
 unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=-cBO1ZwberBxNMqDOtKz8yGwm3zB7elz_st2NKHeczM,26955
@@ -365,8 +365,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
 unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
 unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
 unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
-airbyte_cdk-0.51.10.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-0.51.10.dist-info/METADATA,sha256=ohErCgNizWXyUJffgIlkMxjXMFfVSP9ipuZh7D1ruCQ,9895
-airbyte_cdk-0.51.10.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-airbyte_cdk-0.51.10.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
-airbyte_cdk-0.51.10.dist-info/RECORD,,
+airbyte_cdk-0.51.12.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-0.51.12.dist-info/METADATA,sha256=kP39_c0A5hJ-e8yU-oZ-zAbknbhsFKaz7I11AoLyh5o,9895
+airbyte_cdk-0.51.12.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+airbyte_cdk-0.51.12.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
+airbyte_cdk-0.51.12.dist-info/RECORD,,
unit_tests/sources/file_based/file_types/test_csv_parser.py

@@ -13,6 +13,7 @@ from unittest import TestCase, mock
 from unittest.mock import Mock
 
 import pytest
+from airbyte_cdk.models import FailureType
 from airbyte_cdk.sources.file_based.config.csv_format import (
     DEFAULT_FALSE_VALUES,
     DEFAULT_TRUE_VALUES,
@@ -26,6 +27,7 @@ from airbyte_cdk.sources.file_based.exceptions import RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser, _CsvReader
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
+from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 
 PROPERTY_TYPES = {
     "col1": "null",
@@ -169,7 +171,7 @@ class SchemaInferenceTestCase(TestCase):
         self._config.get_input_schema.return_value = None
         self._config.format = self._config_format
 
-        self._file = Mock(spec=RemoteFile)
+        self._file = RemoteFile(uri="a uri", last_modified=datetime.now())
         self._stream_reader = Mock(spec=AbstractFileBasedStreamReader)
         self._logger = Mock(spec=logging.Logger)
         self._csv_reader = Mock(spec=_CsvReader)
@@ -222,6 +224,12 @@ class SchemaInferenceTestCase(TestCase):
         # since the type is number, we know the string at the end was not considered
         assert inferred_schema == {self._HEADER_NAME: {"type": "number"}}
 
+    def test_given_empty_csv_file_when_infer_schema_then_raise_config_error(self) -> None:
+        self._csv_reader.read_data.return_value = []
+        with pytest.raises(AirbyteTracedException) as exception:
+            self._infer_schema()
+        assert exception.value.failure_type == FailureType.config_error
+
     def _test_infer_schema(self, rows: List[str], expected_type: str) -> None:
         self._csv_reader.read_data.return_value = ({self._HEADER_NAME: row} for row in rows)
         inferred_schema = self._infer_schema()
@@ -260,7 +268,7 @@ class CsvReaderTest(unittest.TestCase):
         self._config.name = self._CONFIG_NAME
         self._config.format = self._config_format
 
-        self._file = Mock(spec=RemoteFile)
+        self._file = RemoteFile(uri="a uri", last_modified=datetime.now())
         self._stream_reader = Mock(spec=AbstractFileBasedStreamReader)
         self._logger = Mock(spec=logging.Logger)
         self._csv_reader = _CsvReader()
@@ -292,6 +300,21 @@ class CsvReaderTest(unittest.TestCase):
 
         assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
 
+    def test_given_skip_row_before_and_after_and_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None:
+        self._config_format.header_definition = CsvHeaderAutogenerated()
+        self._config_format.skip_rows_before_header = 1
+        self._config_format.skip_rows_after_header = 2
+        self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([
+            "skip before",
+            "skip after 1",
+            "skip after 2",
+            "0,1,2,3,4,5,6"
+        ]).build()
+
+        data_generator = self._read_data()
+
+        assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
+
 
     def test_given_user_provided_headers_when_read_data_then_use_user_provided_headers(self) -> None:
         self._config_format.header_definition = CsvHeaderUserProvided(column_names=["first", "second", "third", "fourth"])
         self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
unit_tests/sources/file_based/scenarios/csv_scenarios.py

@@ -111,7 +111,7 @@ single_csv_scenario = (
     "title": "Avro Format",
     "type": "object",
     "properties": {
-        "filetype": {"title": "Filetype", "default": "avro", "enum": ["avro"], "type": "string"},
+        "filetype": {"title": "Filetype", "default": "avro", "const": "avro", "type": "string"},
         "double_as_string": {
             "title": "Convert Double Fields to Strings",
             "description": "Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss precision when handling floating point numbers.",
@@ -124,7 +124,7 @@ single_csv_scenario = (
     "title": "CSV Format",
     "type": "object",
     "properties": {
-        "filetype": {"title": "Filetype", "default": "csv", "enum": ["csv"], "type": "string"},
+        "filetype": {"title": "Filetype", "default": "csv", "const": "csv", "type": "string"},
         "delimiter": {
             "title": "Delimiter",
             "description": "The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.",
@@ -190,21 +190,21 @@ single_csv_scenario = (
     "title": "From CSV",
     "type": "object",
    "properties": {
-        "header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "enum": ["From CSV"], "type": "string"},
+        "header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "const": "From CSV", "type": "string"},
     },
 },
 {
     "title": "Autogenerated",
     "type": "object",
     "properties": {
-        "header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "enum": ["Autogenerated"], "type": "string"},
+        "header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "const": "Autogenerated", "type": "string"},
     },
 },
 {
     "title": "User Provided",
     "type": "object",
     "properties": {
-        "header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "enum": ["User Provided"], "type": "string"},
+        "header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "const": "User Provided", "type": "string"},
         "column_names": {
             "title": "Column Names",
             "description": "The column names that will be used while emitting the CSV records",
@@ -247,7 +247,7 @@ single_csv_scenario = (
     "title": "Jsonl Format",
     "type": "object",
     "properties": {
-        "filetype": {"title": "Filetype", "default": "jsonl", "enum": ["jsonl"], "type": "string"}
+        "filetype": {"title": "Filetype", "default": "jsonl", "const": "jsonl", "type": "string"}
     },
 },
 {
@@ -257,7 +257,7 @@ single_csv_scenario = (
     "filetype": {
         "title": "Filetype",
         "default": "parquet",
-        "enum": ["parquet"],
+        "const": "parquet",
         "type": "string",
     },
     "decimal_as_float": {