airbyte-cdk 0.51.10__py3-none-any.whl → 0.51.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/destinations/vector_db_based/config.py +16 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +10 -6
- airbyte_cdk/destinations/vector_db_based/embedder.py +64 -9
- airbyte_cdk/destinations/vector_db_based/indexer.py +1 -3
- airbyte_cdk/destinations/vector_db_based/test_utils.py +1 -1
- airbyte_cdk/destinations/vector_db_based/writer.py +9 -3
- airbyte_cdk/sources/file_based/config/avro_format.py +4 -2
- airbyte_cdk/sources/file_based/config/csv_format.py +16 -5
- airbyte_cdk/sources/file_based/config/jsonl_format.py +5 -3
- airbyte_cdk/sources/file_based/config/parquet_format.py +4 -2
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +10 -1
- airbyte_cdk/sources/file_based/remote_file.py +0 -9
- {airbyte_cdk-0.51.10.dist-info → airbyte_cdk-0.51.12.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.51.10.dist-info → airbyte_cdk-0.51.12.dist-info}/RECORD +19 -19
- unit_tests/sources/file_based/file_types/test_csv_parser.py +25 -2
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +7 -7
- {airbyte_cdk-0.51.10.dist-info → airbyte_cdk-0.51.12.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.51.10.dist-info → airbyte_cdk-0.51.12.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.51.10.dist-info → airbyte_cdk-0.51.12.dist-info}/top_level.txt +0 -0
@@ -59,6 +59,22 @@ class FakeEmbeddingConfigModel(BaseModel):
|
|
59
59
|
}
|
60
60
|
|
61
61
|
|
62
|
+
class FromFieldEmbeddingConfigModel(BaseModel):
|
63
|
+
mode: Literal["from_field"] = Field("from_field", const=True)
|
64
|
+
field_name: str = Field(
|
65
|
+
..., title="Field name", description="Name of the field in the record that contains the embedding", examples=["embedding", "vector"]
|
66
|
+
)
|
67
|
+
dimensions: int = Field(
|
68
|
+
..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
|
69
|
+
)
|
70
|
+
|
71
|
+
class Config:
|
72
|
+
title = "From Field"
|
73
|
+
schema_extra = {
|
74
|
+
"description": "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
|
75
|
+
}
|
76
|
+
|
77
|
+
|
62
78
|
class CohereEmbeddingConfigModel(BaseModel):
|
63
79
|
mode: Literal["cohere"] = Field("cohere", const=True)
|
64
80
|
cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True)
|
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
|
|
9
9
|
import dpath.util
|
10
10
|
from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
|
11
11
|
from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode
|
12
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
|
12
13
|
from langchain.document_loaders.base import Document
|
13
14
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
14
15
|
from langchain.utils import stringify_dict
|
@@ -21,8 +22,8 @@ METADATA_RECORD_ID_FIELD = "_ab_record_id"
|
|
21
22
|
class Chunk:
|
22
23
|
page_content: str
|
23
24
|
metadata: Dict[str, Any]
|
24
|
-
|
25
|
-
|
25
|
+
record: AirbyteRecordMessage
|
26
|
+
embedding: Optional[List[float]] = None
|
26
27
|
|
27
28
|
|
28
29
|
class DocumentProcessor:
|
@@ -66,11 +67,14 @@ class DocumentProcessor:
|
|
66
67
|
"""
|
67
68
|
doc = self._generate_document(record)
|
68
69
|
if doc is None:
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
70
|
+
text_fields = ", ".join(self.text_fields) if self.text_fields else "all fields"
|
71
|
+
raise AirbyteTracedException(
|
72
|
+
internal_message="No text fields found in record",
|
73
|
+
message=f"Record {str(record.data)[:250]}... does not contain any of the configured text fields: {text_fields}. Please check your processing configuration, there has to be at least one text field set in each record.",
|
74
|
+
failure_type=FailureType.config_error,
|
73
75
|
)
|
76
|
+
chunks = [
|
77
|
+
Chunk(page_content=chunk_document.page_content, metadata=chunk_document.metadata, record=record)
|
74
78
|
for chunk_document in self._split_document(doc)
|
75
79
|
]
|
76
80
|
id_to_delete = doc.metadata[METADATA_RECORD_ID_FIELD] if METADATA_RECORD_ID_FIELD in doc.metadata else None
|
@@ -5,8 +5,15 @@
|
|
5
5
|
from abc import ABC, abstractmethod
|
6
6
|
from typing import List, Optional
|
7
7
|
|
8
|
-
from airbyte_cdk.destinations.vector_db_based.config import
|
8
|
+
from airbyte_cdk.destinations.vector_db_based.config import (
|
9
|
+
CohereEmbeddingConfigModel,
|
10
|
+
FakeEmbeddingConfigModel,
|
11
|
+
FromFieldEmbeddingConfigModel,
|
12
|
+
OpenAIEmbeddingConfigModel,
|
13
|
+
)
|
14
|
+
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
|
9
15
|
from airbyte_cdk.destinations.vector_db_based.utils import format_exception
|
16
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
|
10
17
|
from langchain.embeddings.cohere import CohereEmbeddings
|
11
18
|
from langchain.embeddings.fake import FakeEmbeddings
|
12
19
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
@@ -17,7 +24,7 @@ class Embedder(ABC):
|
|
17
24
|
Embedder is an abstract class that defines the interface for embedding text.
|
18
25
|
|
19
26
|
The Indexer class uses the Embedder class to internally embed text - each indexer is responsible to pass the text of all documents to the embedder and store the resulting embeddings in the destination.
|
20
|
-
The destination connector is responsible to create an embedder instance and pass it to the
|
27
|
+
The destination connector is responsible to create an embedder instance and pass it to the writer.
|
21
28
|
The CDK defines basic embedders that should be supported in each destination. It is possible to implement custom embedders for special destinations if needed.
|
22
29
|
"""
|
23
30
|
|
@@ -29,7 +36,11 @@ class Embedder(ABC):
|
|
29
36
|
pass
|
30
37
|
|
31
38
|
@abstractmethod
|
32
|
-
def
|
39
|
+
def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
|
40
|
+
"""
|
41
|
+
Embed the text of each chunk and return the resulting embedding vectors.
|
42
|
+
If a chunk cannot be embedded or is configured to not be embedded, return None for that chunk.
|
43
|
+
"""
|
33
44
|
pass
|
34
45
|
|
35
46
|
@property
|
@@ -54,8 +65,8 @@ class OpenAIEmbedder(Embedder):
|
|
54
65
|
return format_exception(e)
|
55
66
|
return None
|
56
67
|
|
57
|
-
def
|
58
|
-
return self.embeddings.embed_documents(
|
68
|
+
def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
|
69
|
+
return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
|
59
70
|
|
60
71
|
@property
|
61
72
|
def embedding_dimensions(self) -> int:
|
@@ -79,8 +90,8 @@ class CohereEmbedder(Embedder):
|
|
79
90
|
return format_exception(e)
|
80
91
|
return None
|
81
92
|
|
82
|
-
def
|
83
|
-
return self.embeddings.embed_documents(
|
93
|
+
def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
|
94
|
+
return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
|
84
95
|
|
85
96
|
@property
|
86
97
|
def embedding_dimensions(self) -> int:
|
@@ -100,10 +111,54 @@ class FakeEmbedder(Embedder):
|
|
100
111
|
return format_exception(e)
|
101
112
|
return None
|
102
113
|
|
103
|
-
def
|
104
|
-
return self.embeddings.embed_documents(
|
114
|
+
def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
|
115
|
+
return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
|
105
116
|
|
106
117
|
@property
|
107
118
|
def embedding_dimensions(self) -> int:
|
108
119
|
# use same vector size as for OpenAI embeddings to keep it realistic
|
109
120
|
return OPEN_AI_VECTOR_SIZE
|
121
|
+
|
122
|
+
|
123
|
+
class FromFieldEmbedder(Embedder):
|
124
|
+
def __init__(self, config: FromFieldEmbeddingConfigModel):
|
125
|
+
super().__init__()
|
126
|
+
self.config = config
|
127
|
+
|
128
|
+
def check(self) -> Optional[str]:
|
129
|
+
return None
|
130
|
+
|
131
|
+
def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
|
132
|
+
"""
|
133
|
+
From each chunk, pull the embedding from the field specified in the config.
|
134
|
+
Check that the field exists, is a list of numbers and is the correct size. If not, raise an AirbyteTracedException explaining the problem.
|
135
|
+
"""
|
136
|
+
embeddings = []
|
137
|
+
for chunk in chunks:
|
138
|
+
data = chunk.record.data
|
139
|
+
if self.config.field_name not in data:
|
140
|
+
raise AirbyteTracedException(
|
141
|
+
internal_message="Embedding vector field not found",
|
142
|
+
failure_type=FailureType.config_error,
|
143
|
+
message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.",
|
144
|
+
)
|
145
|
+
field = data[self.config.field_name]
|
146
|
+
if not isinstance(field, list) or not all(isinstance(x, (int, float)) for x in field):
|
147
|
+
raise AirbyteTracedException(
|
148
|
+
internal_message="Embedding vector field not a list of numbers",
|
149
|
+
failure_type=FailureType.config_error,
|
150
|
+
message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
|
151
|
+
)
|
152
|
+
if len(field) != self.config.dimensions:
|
153
|
+
raise AirbyteTracedException(
|
154
|
+
internal_message="Embedding vector field has wrong length",
|
155
|
+
failure_type=FailureType.config_error,
|
156
|
+
message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
|
157
|
+
)
|
158
|
+
embeddings.append(field)
|
159
|
+
|
160
|
+
return embeddings
|
161
|
+
|
162
|
+
@property
|
163
|
+
def embedding_dimensions(self) -> int:
|
164
|
+
return self.config.dimensions
|
@@ -7,7 +7,6 @@ from abc import ABC, abstractmethod
|
|
7
7
|
from typing import Any, Generator, Iterable, List, Optional, Tuple, TypeVar
|
8
8
|
|
9
9
|
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
|
10
|
-
from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
|
11
10
|
from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog
|
12
11
|
|
13
12
|
|
@@ -19,9 +18,8 @@ class Indexer(ABC):
|
|
19
18
|
In a destination connector, implement a custom indexer by extending this class and implementing the abstract methods.
|
20
19
|
"""
|
21
20
|
|
22
|
-
def __init__(self, config: Any
|
21
|
+
def __init__(self, config: Any):
|
23
22
|
self.config = config
|
24
|
-
self.embedder = embedder
|
25
23
|
pass
|
26
24
|
|
27
25
|
def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None:
|
@@ -48,6 +48,6 @@ class BaseIntegrationTest(unittest.TestCase):
|
|
48
48
|
type=Type.RECORD, record=AirbyteRecordMessage(stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0)
|
49
49
|
)
|
50
50
|
|
51
|
-
def setUp(self):
|
51
|
+
def setUp(self) -> None:
|
52
52
|
with open("secrets/config.json", "r") as f:
|
53
53
|
self.config = json.loads(f.read())
|
@@ -8,24 +8,27 @@ from typing import Iterable, List
|
|
8
8
|
from airbyte_cdk.destinations.vector_db_based.batcher import Batcher
|
9
9
|
from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
|
10
10
|
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk, DocumentProcessor
|
11
|
+
from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
|
11
12
|
from airbyte_cdk.destinations.vector_db_based.indexer import Indexer
|
12
13
|
from airbyte_cdk.models import AirbyteMessage, AirbyteRecordMessage, ConfiguredAirbyteCatalog, Type
|
13
14
|
|
14
15
|
|
15
16
|
class Writer:
|
16
17
|
"""
|
17
|
-
The Writer class is orchestrating the document processor, the batcher and the indexer:
|
18
|
+
The Writer class is orchestrating the document processor, the batcher, the embedder and the indexer:
|
18
19
|
* Incoming records are collected using the batcher
|
19
20
|
* The document processor generates documents from all records in the batch
|
20
|
-
* The
|
21
|
+
* The embedder embeds the documents
|
22
|
+
* The indexer indexes the resulting documents and their embeddings in the destination
|
21
23
|
|
22
24
|
The destination connector is responsible to create a writer instance and pass the input messages iterable to the write method.
|
23
25
|
The batch size can be configured by the destination connector to give the freedom of either letting the user configure it or hardcoding it to a sensible value depending on the destination.
|
24
26
|
"""
|
25
27
|
|
26
|
-
def __init__(self, processing_config: ProcessingConfigModel, indexer: Indexer, batch_size: int) -> None:
|
28
|
+
def __init__(self, processing_config: ProcessingConfigModel, indexer: Indexer, embedder: Embedder, batch_size: int) -> None:
|
27
29
|
self.processing_config = processing_config
|
28
30
|
self.indexer = indexer
|
31
|
+
self.embedder = embedder
|
29
32
|
self.batcher = Batcher(batch_size, lambda batch: self._process_batch(batch))
|
30
33
|
|
31
34
|
def _process_batch(self, batch: List[AirbyteRecordMessage]) -> None:
|
@@ -36,6 +39,9 @@ class Writer:
|
|
36
39
|
documents.extend(record_documents)
|
37
40
|
if record_id_to_delete is not None:
|
38
41
|
ids_to_delete.append(record_id_to_delete)
|
42
|
+
embeddings = self.embedder.embed_chunks(documents)
|
43
|
+
for i, document in enumerate(documents):
|
44
|
+
document.embedding = embeddings[i]
|
39
45
|
self.indexer.index(documents, ids_to_delete)
|
40
46
|
|
41
47
|
def write(self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]) -> Iterable[AirbyteMessage]:
|
@@ -3,14 +3,16 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
from pydantic import BaseModel, Field
|
6
|
-
from typing_extensions import Literal
|
7
6
|
|
8
7
|
|
9
8
|
class AvroFormat(BaseModel):
|
10
9
|
class Config:
|
11
10
|
title = "Avro Format"
|
12
11
|
|
13
|
-
filetype:
|
12
|
+
filetype: str = Field(
|
13
|
+
"avro",
|
14
|
+
const=True,
|
15
|
+
)
|
14
16
|
|
15
17
|
double_as_string: bool = Field(
|
16
18
|
title="Convert Double Fields to Strings",
|
@@ -7,7 +7,6 @@ from enum import Enum
|
|
7
7
|
from typing import Any, Dict, List, Optional, Set, Union
|
8
8
|
|
9
9
|
from pydantic import BaseModel, Field, ValidationError, root_validator, validator
|
10
|
-
from typing_extensions import Literal
|
11
10
|
|
12
11
|
|
13
12
|
class InferenceType(Enum):
|
@@ -25,7 +24,10 @@ class CsvHeaderFromCsv(BaseModel):
|
|
25
24
|
class Config:
|
26
25
|
title = "From CSV"
|
27
26
|
|
28
|
-
header_definition_type:
|
27
|
+
header_definition_type: str = Field(
|
28
|
+
CsvHeaderDefinitionType.FROM_CSV.value,
|
29
|
+
const=True,
|
30
|
+
)
|
29
31
|
|
30
32
|
def has_header_row(self) -> bool:
|
31
33
|
return True
|
@@ -35,7 +37,10 @@ class CsvHeaderAutogenerated(BaseModel):
|
|
35
37
|
class Config:
|
36
38
|
title = "Autogenerated"
|
37
39
|
|
38
|
-
header_definition_type:
|
40
|
+
header_definition_type: str = Field(
|
41
|
+
CsvHeaderDefinitionType.AUTOGENERATED.value,
|
42
|
+
const=True,
|
43
|
+
)
|
39
44
|
|
40
45
|
def has_header_row(self) -> bool:
|
41
46
|
return False
|
@@ -45,7 +50,10 @@ class CsvHeaderUserProvided(BaseModel):
|
|
45
50
|
class Config:
|
46
51
|
title = "User Provided"
|
47
52
|
|
48
|
-
header_definition_type:
|
53
|
+
header_definition_type: str = Field(
|
54
|
+
CsvHeaderDefinitionType.USER_PROVIDED.value,
|
55
|
+
const=True,
|
56
|
+
)
|
49
57
|
column_names: List[str] = Field(
|
50
58
|
title="Column Names",
|
51
59
|
description="The column names that will be used while emitting the CSV records",
|
@@ -69,7 +77,10 @@ class CsvFormat(BaseModel):
|
|
69
77
|
class Config:
|
70
78
|
title = "CSV Format"
|
71
79
|
|
72
|
-
filetype:
|
80
|
+
filetype: str = Field(
|
81
|
+
"csv",
|
82
|
+
const=True,
|
83
|
+
)
|
73
84
|
delimiter: str = Field(
|
74
85
|
title="Delimiter",
|
75
86
|
description="The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.",
|
@@ -2,12 +2,14 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
-
from pydantic import BaseModel
|
6
|
-
from typing_extensions import Literal
|
5
|
+
from pydantic import BaseModel, Field
|
7
6
|
|
8
7
|
|
9
8
|
class JsonlFormat(BaseModel):
|
10
9
|
class Config:
|
11
10
|
title = "Jsonl Format"
|
12
11
|
|
13
|
-
filetype:
|
12
|
+
filetype: str = Field(
|
13
|
+
"jsonl",
|
14
|
+
const=True,
|
15
|
+
)
|
@@ -3,14 +3,16 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
from pydantic import BaseModel, Field
|
6
|
-
from typing_extensions import Literal
|
7
6
|
|
8
7
|
|
9
8
|
class ParquetFormat(BaseModel):
|
10
9
|
class Config:
|
11
10
|
title = "Parquet Format"
|
12
11
|
|
13
|
-
filetype:
|
12
|
+
filetype: str = Field(
|
13
|
+
"parquet",
|
14
|
+
const=True,
|
15
|
+
)
|
14
16
|
# This option is not recommended, but necessary for backwards compatibility
|
15
17
|
decimal_as_float: bool = Field(
|
16
18
|
title="Convert Decimal Fields to Floats",
|
@@ -11,6 +11,7 @@ from functools import partial
|
|
11
11
|
from io import IOBase
|
12
12
|
from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set
|
13
13
|
|
14
|
+
from airbyte_cdk.models import FailureType
|
14
15
|
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType
|
15
16
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
16
17
|
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
|
@@ -18,6 +19,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFile
|
|
18
19
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
19
20
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
20
21
|
from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING, SchemaType
|
22
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
21
23
|
|
22
24
|
DIALECT_NAME = "_config_dialect"
|
23
25
|
|
@@ -75,11 +77,12 @@ class _CsvReader:
|
|
75
77
|
if isinstance(config_format.header_definition, CsvHeaderUserProvided):
|
76
78
|
return config_format.header_definition.column_names # type: ignore # should be CsvHeaderUserProvided given the type
|
77
79
|
|
78
|
-
self._skip_rows(fp, config_format.skip_rows_before_header)
|
79
80
|
if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
|
81
|
+
self._skip_rows(fp, config_format.skip_rows_before_header + config_format.skip_rows_after_header)
|
80
82
|
headers = self._auto_generate_headers(fp, dialect_name)
|
81
83
|
else:
|
82
84
|
# Then read the header
|
85
|
+
self._skip_rows(fp, config_format.skip_rows_before_header)
|
83
86
|
reader = csv.reader(fp, dialect=dialect_name) # type: ignore
|
84
87
|
headers = list(next(reader))
|
85
88
|
|
@@ -141,6 +144,12 @@ class CsvParser(FileTypeParser):
|
|
141
144
|
if read_bytes >= self._MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE:
|
142
145
|
break
|
143
146
|
|
147
|
+
if not type_inferrer_by_field:
|
148
|
+
raise AirbyteTracedException(
|
149
|
+
message=f"Could not infer schema as there are no rows in {file.uri}. If having an empty CSV file is expected, ignore this. "
|
150
|
+
f"Else, please contact Airbyte.",
|
151
|
+
failure_type=FailureType.config_error,
|
152
|
+
)
|
144
153
|
schema = {header.strip(): {"type": type_inferred.infer()} for header, type_inferred in type_inferrer_by_field.items()}
|
145
154
|
data_generator.close()
|
146
155
|
return schema
|
@@ -3,7 +3,6 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
from datetime import datetime
|
6
|
-
from typing import Optional
|
7
6
|
|
8
7
|
from pydantic import BaseModel
|
9
8
|
|
@@ -15,11 +14,3 @@ class RemoteFile(BaseModel):
|
|
15
14
|
|
16
15
|
uri: str
|
17
16
|
last_modified: datetime
|
18
|
-
|
19
|
-
def extension_agrees_with_file_type(self, file_type: Optional[str]) -> bool:
|
20
|
-
extensions = self.uri.split(".")[1:]
|
21
|
-
if not extensions:
|
22
|
-
return True
|
23
|
-
if not file_type:
|
24
|
-
return True
|
25
|
-
return any(file_type.casefold() in e.casefold() for e in extensions)
|
@@ -14,13 +14,13 @@ airbyte_cdk/destinations/__init__.py,sha256=0Uxmz3iBAyZJdk_bqUVt2pb0UwRTpFjTnFE6
|
|
14
14
|
airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQwQj8MyVm3BI,5420
|
15
15
|
airbyte_cdk/destinations/vector_db_based/__init__.py,sha256=z5Pqxxt3v-JCcJQ6sK4tAz5sg1FB-3wTCd2p85MhFzc,711
|
16
16
|
airbyte_cdk/destinations/vector_db_based/batcher.py,sha256=U2RI0CACZ1WhJIdkC5oPlwZ90OZB40kyFCR5I7StqZw,1160
|
17
|
-
airbyte_cdk/destinations/vector_db_based/config.py,sha256=
|
18
|
-
airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=
|
19
|
-
airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=
|
20
|
-
airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=
|
21
|
-
airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=
|
17
|
+
airbyte_cdk/destinations/vector_db_based/config.py,sha256=xv5-IhPG_eKdRxstYmaFBUrYDECevE64OVRyUBZAJJw,4132
|
18
|
+
airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=KHvCSjt6amwpIYxK42OuT1Vh-RCA5A3vEBfAmowXpZI,6161
|
19
|
+
airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=0YLm5wmqiwCyUD_GWzqetWclzzKsADOfjXu0jMhQS1Y,6837
|
20
|
+
airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=DMic7D7ie4gGQ-yOgGXGYjBsY8H7X5O5Tz_sCr0ajBU,2327
|
21
|
+
airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=8d1Smk4jQRKtDfloXfEq12T-BU8ByyzzSBwAlchsU4A,1807
|
22
22
|
airbyte_cdk/destinations/vector_db_based/utils.py,sha256=ngJ6hc9mmzgAEEBd9nuoRcPPFUKijv2CA6zZYUVRm54,240
|
23
|
-
airbyte_cdk/destinations/vector_db_based/writer.py,sha256=
|
23
|
+
airbyte_cdk/destinations/vector_db_based/writer.py,sha256=zSVizVPupTjdF_dwniIU0RYnTZ9TMkizOK48tDNPxxk,3110
|
24
24
|
airbyte_cdk/models/__init__.py,sha256=rDARocDgxf4_qI66Bm6dHTBoecbWguTClGVBmOBiI2o,1674
|
25
25
|
airbyte_cdk/models/airbyte_protocol.py,sha256=DoJvnmGM3xMAZFTwA6_RGMiKSFqfE3ib_Ru0KJ65Ag4,100
|
26
26
|
airbyte_cdk/models/well_known_types.py,sha256=KKfNbow2gdLoC1Z4hcXy_JR8m_acsB2ol7gQuEgjobw,117
|
@@ -148,7 +148,7 @@ airbyte_cdk/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
|
|
148
148
|
airbyte_cdk/sources/file_based/exceptions.py,sha256=4jwHysXT6r2o37Z7ch00nbo45wPVsmCorRYbYTmWd2Q,3656
|
149
149
|
airbyte_cdk/sources/file_based/file_based_source.py,sha256=NCbXAGPWBQSPAf5x2U2eCdOLUd26RhO5s6K87_AF8Es,6931
|
150
150
|
airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=K9fFHcSL4E8v-X2l38wRAcZCjpyifr35orvby8vQt84,3749
|
151
|
-
airbyte_cdk/sources/file_based/remote_file.py,sha256=
|
151
|
+
airbyte_cdk/sources/file_based/remote_file.py,sha256=xIDwDDBPhJI1K8YZuXjEfjxakZPMieBKJM6vmq6G5tw,248
|
152
152
|
airbyte_cdk/sources/file_based/schema_helpers.py,sha256=XBkOutIw_n6SNYU34qbyTbl0Ppt0i4k3sVFMSaX3wJo,9103
|
153
153
|
airbyte_cdk/sources/file_based/types.py,sha256=INxG7OPnkdUP69oYNKMAbwhvV1AGvLRHs1J6pIia2FI,218
|
154
154
|
airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=WiPPuQGfmQlFUMFR5h3ECc-VzBj4vair6_4WAL87AEI,277
|
@@ -156,17 +156,17 @@ airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availab
|
|
156
156
|
airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=InGBlGbInuNUQ6oaK5A9oICVc7ZNHMSYo8g5Vy2smOo,4266
|
157
157
|
airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
158
158
|
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=WrV4sKtJoZ1dK31HK7NdBKlnYHkmu6NqjmEpkVqJ6tQ,4582
|
159
|
-
airbyte_cdk/sources/file_based/config/avro_format.py,sha256=
|
160
|
-
airbyte_cdk/sources/file_based/config/csv_format.py,sha256
|
159
|
+
airbyte_cdk/sources/file_based/config/avro_format.py,sha256=oLJIuNInu-MgjkVFqwHvmQ4CPZa4NZingq_I0_trQ3g,589
|
160
|
+
airbyte_cdk/sources/file_based/config/csv_format.py,sha256=xlBZ5WyAshagjjjbUV_je1JyZ1oY1GbIzJRUZ9UfSvo,7095
|
161
161
|
airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=5R0UlPJUGGx5OnpezZ0Fd8dyO4y2vMZtiPZR_3rfvSk,5916
|
162
|
-
airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=
|
163
|
-
airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=
|
162
|
+
airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=usmTeTw8xw8OKwrz8MsiS5E1LQiVEbedGHMHNAfOOlk,252
|
163
|
+
airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=O_Eq0yVzjPiKDz8H1-f9yMowtCcJwT9F2prNYpXZkp0,614
|
164
164
|
airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=x_7JsQGiS7Ytmr0ZDS0SNYGcNUzC4wCm3_1-Mf3ZFnw,283
|
165
165
|
airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=cz9po5Cn6u50uq3hDy46pqnPR4JDcnRItZX9k0WDUJU,520
|
166
166
|
airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=Mx3zT9Dem4uNfaUT0oOtrESsuB1LrGAi5N-uw2swZZA,701
|
167
167
|
airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=N3a8cjXwRUN2__46IJTwrWlsyFiSA1xtSgPcPH28sn0,476
|
168
168
|
airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=7PVaW17wn80HYW1mu074X2dy0UgFoqFqGIOKN2ZMKD0,8686
|
169
|
-
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=
|
169
|
+
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=_JQdzZMKkmcOPui7DyrF23twrT6wiXugXyKJEPhi-js,17252
|
170
170
|
airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=cThTLc1YKSAapOn70lB09SzruRIPSShGIMz1f92QYV8,1555
|
171
171
|
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
|
172
172
|
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=QulQ_soGb1LpQ_KTxqWZjmfACGkTUDUOeuSmNFtcSLk,8717
|
@@ -328,13 +328,13 @@ unit_tests/sources/file_based/config/test_csv_format.py,sha256=VYL-9Ec8hW_yO2Pj9
|
|
328
328
|
unit_tests/sources/file_based/config/test_file_based_stream_config.py,sha256=1eMsHlMQIFwyw20HjnhgKuiw6399sMcLTQ4LP09kTT4,3060
|
329
329
|
unit_tests/sources/file_based/file_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
330
330
|
unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=INqwKXcgNb3h_tktNXYU6WNUD-iNwRYHCd3IrnQa5R4,11051
|
331
|
-
unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=
|
331
|
+
unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=PalrxCRHAyoIp12IWWyePS9QF4LcvNVkqrKdwkrayJ4,22457
|
332
332
|
unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
|
333
333
|
unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=D7sKTty8aEqMDWWGKWUqDbWjTxhGkygU7ns4-_JceRY,13543
|
334
334
|
unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
335
335
|
unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=xUfw0crAvSTgQ2-chJx2ZiigQyo5IfrCuOFC1TWXXsQ,29795
|
336
336
|
unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=xZf28rlH93ap8JCkAjwocng-uAW-mvMx6BDOLbvVCig,5588
|
337
|
-
unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=
|
337
|
+
unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=_5FYtChp1B8D_6gHbmyDNm19Aa9rCk4JDm7u47p-W3M,98717
|
338
338
|
unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=0maHng11cFmvzFLOniyBxOEYoKj4DYR3NO9-pSYoFLs,60710
|
339
339
|
unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=N83fga4gMKkbm6hYnen1Z5p5eEgjnMB_M_sXx6B96cU,27503
|
340
340
|
unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=-cBO1ZwberBxNMqDOtKz8yGwm3zB7elz_st2NKHeczM,26955
|
@@ -365,8 +365,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
|
|
365
365
|
unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
|
366
366
|
unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
|
367
367
|
unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
|
368
|
-
airbyte_cdk-0.51.
|
369
|
-
airbyte_cdk-0.51.
|
370
|
-
airbyte_cdk-0.51.
|
371
|
-
airbyte_cdk-0.51.
|
372
|
-
airbyte_cdk-0.51.
|
368
|
+
airbyte_cdk-0.51.12.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
369
|
+
airbyte_cdk-0.51.12.dist-info/METADATA,sha256=kP39_c0A5hJ-e8yU-oZ-zAbknbhsFKaz7I11AoLyh5o,9895
|
370
|
+
airbyte_cdk-0.51.12.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
371
|
+
airbyte_cdk-0.51.12.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
|
372
|
+
airbyte_cdk-0.51.12.dist-info/RECORD,,
|
@@ -13,6 +13,7 @@ from unittest import TestCase, mock
|
|
13
13
|
from unittest.mock import Mock
|
14
14
|
|
15
15
|
import pytest
|
16
|
+
from airbyte_cdk.models import FailureType
|
16
17
|
from airbyte_cdk.sources.file_based.config.csv_format import (
|
17
18
|
DEFAULT_FALSE_VALUES,
|
18
19
|
DEFAULT_TRUE_VALUES,
|
@@ -26,6 +27,7 @@ from airbyte_cdk.sources.file_based.exceptions import RecordParseError
|
|
26
27
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
27
28
|
from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser, _CsvReader
|
28
29
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
30
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
29
31
|
|
30
32
|
PROPERTY_TYPES = {
|
31
33
|
"col1": "null",
|
@@ -169,7 +171,7 @@ class SchemaInferenceTestCase(TestCase):
|
|
169
171
|
self._config.get_input_schema.return_value = None
|
170
172
|
self._config.format = self._config_format
|
171
173
|
|
172
|
-
self._file =
|
174
|
+
self._file = RemoteFile(uri="a uri", last_modified=datetime.now())
|
173
175
|
self._stream_reader = Mock(spec=AbstractFileBasedStreamReader)
|
174
176
|
self._logger = Mock(spec=logging.Logger)
|
175
177
|
self._csv_reader = Mock(spec=_CsvReader)
|
@@ -222,6 +224,12 @@ class SchemaInferenceTestCase(TestCase):
|
|
222
224
|
# since the type is number, we know the string at the end was not considered
|
223
225
|
assert inferred_schema == {self._HEADER_NAME: {"type": "number"}}
|
224
226
|
|
227
|
+
def test_given_empty_csv_file_when_infer_schema_then_raise_config_error(self) -> None:
|
228
|
+
self._csv_reader.read_data.return_value = []
|
229
|
+
with pytest.raises(AirbyteTracedException) as exception:
|
230
|
+
self._infer_schema()
|
231
|
+
assert exception.value.failure_type == FailureType.config_error
|
232
|
+
|
225
233
|
def _test_infer_schema(self, rows: List[str], expected_type: str) -> None:
|
226
234
|
self._csv_reader.read_data.return_value = ({self._HEADER_NAME: row} for row in rows)
|
227
235
|
inferred_schema = self._infer_schema()
|
@@ -260,7 +268,7 @@ class CsvReaderTest(unittest.TestCase):
|
|
260
268
|
self._config.name = self._CONFIG_NAME
|
261
269
|
self._config.format = self._config_format
|
262
270
|
|
263
|
-
self._file =
|
271
|
+
self._file = RemoteFile(uri="a uri", last_modified=datetime.now())
|
264
272
|
self._stream_reader = Mock(spec=AbstractFileBasedStreamReader)
|
265
273
|
self._logger = Mock(spec=logging.Logger)
|
266
274
|
self._csv_reader = _CsvReader()
|
@@ -292,6 +300,21 @@ class CsvReaderTest(unittest.TestCase):
|
|
292
300
|
|
293
301
|
assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
|
294
302
|
|
303
|
+
def test_given_skip_row_before_and_after_and_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None:
|
304
|
+
self._config_format.header_definition = CsvHeaderAutogenerated()
|
305
|
+
self._config_format.skip_rows_before_header = 1
|
306
|
+
self._config_format.skip_rows_after_header = 2
|
307
|
+
self._stream_reader.open_file.return_value = CsvFileBuilder().with_data([
|
308
|
+
"skip before",
|
309
|
+
"skip after 1",
|
310
|
+
"skip after 2",
|
311
|
+
"0,1,2,3,4,5,6"
|
312
|
+
]).build()
|
313
|
+
|
314
|
+
data_generator = self._read_data()
|
315
|
+
|
316
|
+
assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
|
317
|
+
|
295
318
|
def test_given_user_provided_headers_when_read_data_then_use_user_provided_headers(self) -> None:
|
296
319
|
self._config_format.header_definition = CsvHeaderUserProvided(column_names=["first", "second", "third", "fourth"])
|
297
320
|
self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
|
@@ -111,7 +111,7 @@ single_csv_scenario = (
|
|
111
111
|
"title": "Avro Format",
|
112
112
|
"type": "object",
|
113
113
|
"properties": {
|
114
|
-
"filetype": {"title": "Filetype", "default": "avro", "
|
114
|
+
"filetype": {"title": "Filetype", "default": "avro", "const": "avro", "type": "string"},
|
115
115
|
"double_as_string": {
|
116
116
|
"title": "Convert Double Fields to Strings",
|
117
117
|
"description": "Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss precision when handling floating point numbers.",
|
@@ -124,7 +124,7 @@ single_csv_scenario = (
|
|
124
124
|
"title": "CSV Format",
|
125
125
|
"type": "object",
|
126
126
|
"properties": {
|
127
|
-
"filetype": {"title": "Filetype", "default": "csv", "
|
127
|
+
"filetype": {"title": "Filetype", "default": "csv", "const": "csv", "type": "string"},
|
128
128
|
"delimiter": {
|
129
129
|
"title": "Delimiter",
|
130
130
|
"description": "The character delimiting individual cells in the CSV data. This may only be a 1-character string. For tab-delimited data enter '\\t'.",
|
@@ -190,21 +190,21 @@ single_csv_scenario = (
|
|
190
190
|
"title": "From CSV",
|
191
191
|
"type": "object",
|
192
192
|
"properties": {
|
193
|
-
"header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "
|
193
|
+
"header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "const": "From CSV", "type": "string"},
|
194
194
|
},
|
195
195
|
},
|
196
196
|
{
|
197
197
|
"title": "Autogenerated",
|
198
198
|
"type": "object",
|
199
199
|
"properties": {
|
200
|
-
"header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "
|
200
|
+
"header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "const": "Autogenerated", "type": "string"},
|
201
201
|
},
|
202
202
|
},
|
203
203
|
{
|
204
204
|
"title": "User Provided",
|
205
205
|
"type": "object",
|
206
206
|
"properties": {
|
207
|
-
"header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "
|
207
|
+
"header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "const": "User Provided", "type": "string"},
|
208
208
|
"column_names": {
|
209
209
|
"title": "Column Names",
|
210
210
|
"description": "The column names that will be used while emitting the CSV records",
|
@@ -247,7 +247,7 @@ single_csv_scenario = (
|
|
247
247
|
"title": "Jsonl Format",
|
248
248
|
"type": "object",
|
249
249
|
"properties": {
|
250
|
-
"filetype": {"title": "Filetype", "default": "jsonl", "
|
250
|
+
"filetype": {"title": "Filetype", "default": "jsonl", "const": "jsonl", "type": "string"}
|
251
251
|
},
|
252
252
|
},
|
253
253
|
{
|
@@ -257,7 +257,7 @@ single_csv_scenario = (
|
|
257
257
|
"filetype": {
|
258
258
|
"title": "Filetype",
|
259
259
|
"default": "parquet",
|
260
|
-
"
|
260
|
+
"const": "parquet",
|
261
261
|
"type": "string",
|
262
262
|
},
|
263
263
|
"decimal_as_float": {
|
File without changes
|
File without changes
|
File without changes
|