airbyte-cdk 0.51.11__py3-none-any.whl → 0.51.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/destinations/vector_db_based/config.py +16 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +10 -6
- airbyte_cdk/destinations/vector_db_based/embedder.py +64 -9
- airbyte_cdk/destinations/vector_db_based/indexer.py +1 -3
- airbyte_cdk/destinations/vector_db_based/test_utils.py +1 -1
- airbyte_cdk/destinations/vector_db_based/writer.py +9 -3
- {airbyte_cdk-0.51.11.dist-info → airbyte_cdk-0.51.12.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.51.11.dist-info → airbyte_cdk-0.51.12.dist-info}/RECORD +11 -11
- {airbyte_cdk-0.51.11.dist-info → airbyte_cdk-0.51.12.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.51.11.dist-info → airbyte_cdk-0.51.12.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.51.11.dist-info → airbyte_cdk-0.51.12.dist-info}/top_level.txt +0 -0
@@ -59,6 +59,22 @@ class FakeEmbeddingConfigModel(BaseModel):
|
|
59
59
|
}
|
60
60
|
|
61
61
|
|
62
|
+
class FromFieldEmbeddingConfigModel(BaseModel):
|
63
|
+
mode: Literal["from_field"] = Field("from_field", const=True)
|
64
|
+
field_name: str = Field(
|
65
|
+
..., title="Field name", description="Name of the field in the record that contains the embedding", examples=["embedding", "vector"]
|
66
|
+
)
|
67
|
+
dimensions: int = Field(
|
68
|
+
..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
|
69
|
+
)
|
70
|
+
|
71
|
+
class Config:
|
72
|
+
title = "From Field"
|
73
|
+
schema_extra = {
|
74
|
+
"description": "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
|
75
|
+
}
|
76
|
+
|
77
|
+
|
62
78
|
class CohereEmbeddingConfigModel(BaseModel):
|
63
79
|
mode: Literal["cohere"] = Field("cohere", const=True)
|
64
80
|
cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True)
|
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
|
|
9
9
|
import dpath.util
|
10
10
|
from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
|
11
11
|
from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode
|
12
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
|
12
13
|
from langchain.document_loaders.base import Document
|
13
14
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
14
15
|
from langchain.utils import stringify_dict
|
@@ -21,8 +22,8 @@ METADATA_RECORD_ID_FIELD = "_ab_record_id"
|
|
21
22
|
class Chunk:
|
22
23
|
page_content: str
|
23
24
|
metadata: Dict[str, Any]
|
24
|
-
|
25
|
-
|
25
|
+
record: AirbyteRecordMessage
|
26
|
+
embedding: Optional[List[float]] = None
|
26
27
|
|
27
28
|
|
28
29
|
class DocumentProcessor:
|
@@ -66,11 +67,14 @@ class DocumentProcessor:
|
|
66
67
|
"""
|
67
68
|
doc = self._generate_document(record)
|
68
69
|
if doc is None:
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
70
|
+
text_fields = ", ".join(self.text_fields) if self.text_fields else "all fields"
|
71
|
+
raise AirbyteTracedException(
|
72
|
+
internal_message="No text fields found in record",
|
73
|
+
message=f"Record {str(record.data)[:250]}... does not contain any of the configured text fields: {text_fields}. Please check your processing configuration, there has to be at least one text field set in each record.",
|
74
|
+
failure_type=FailureType.config_error,
|
73
75
|
)
|
76
|
+
chunks = [
|
77
|
+
Chunk(page_content=chunk_document.page_content, metadata=chunk_document.metadata, record=record)
|
74
78
|
for chunk_document in self._split_document(doc)
|
75
79
|
]
|
76
80
|
id_to_delete = doc.metadata[METADATA_RECORD_ID_FIELD] if METADATA_RECORD_ID_FIELD in doc.metadata else None
|
@@ -5,8 +5,15 @@
|
|
5
5
|
from abc import ABC, abstractmethod
|
6
6
|
from typing import List, Optional
|
7
7
|
|
8
|
-
from airbyte_cdk.destinations.vector_db_based.config import
|
8
|
+
from airbyte_cdk.destinations.vector_db_based.config import (
|
9
|
+
CohereEmbeddingConfigModel,
|
10
|
+
FakeEmbeddingConfigModel,
|
11
|
+
FromFieldEmbeddingConfigModel,
|
12
|
+
OpenAIEmbeddingConfigModel,
|
13
|
+
)
|
14
|
+
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
|
9
15
|
from airbyte_cdk.destinations.vector_db_based.utils import format_exception
|
16
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
|
10
17
|
from langchain.embeddings.cohere import CohereEmbeddings
|
11
18
|
from langchain.embeddings.fake import FakeEmbeddings
|
12
19
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
@@ -17,7 +24,7 @@ class Embedder(ABC):
|
|
17
24
|
Embedder is an abstract class that defines the interface for embedding text.
|
18
25
|
|
19
26
|
The Indexer class uses the Embedder class to internally embed text - each indexer is responsible to pass the text of all documents to the embedder and store the resulting embeddings in the destination.
|
20
|
-
The destination connector is responsible to create an embedder instance and pass it to the
|
27
|
+
The destination connector is responsible to create an embedder instance and pass it to the writer.
|
21
28
|
The CDK defines basic embedders that should be supported in each destination. It is possible to implement custom embedders for special destinations if needed.
|
22
29
|
"""
|
23
30
|
|
@@ -29,7 +36,11 @@ class Embedder(ABC):
|
|
29
36
|
pass
|
30
37
|
|
31
38
|
@abstractmethod
|
32
|
-
def
|
39
|
+
def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
|
40
|
+
"""
|
41
|
+
Embed the text of each chunk and return the resulting embedding vectors.
|
42
|
+
If a chunk cannot be embedded or is configured to not be embedded, return None for that chunk.
|
43
|
+
"""
|
33
44
|
pass
|
34
45
|
|
35
46
|
@property
|
@@ -54,8 +65,8 @@ class OpenAIEmbedder(Embedder):
|
|
54
65
|
return format_exception(e)
|
55
66
|
return None
|
56
67
|
|
57
|
-
def
|
58
|
-
return self.embeddings.embed_documents(
|
68
|
+
def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
|
69
|
+
return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
|
59
70
|
|
60
71
|
@property
|
61
72
|
def embedding_dimensions(self) -> int:
|
@@ -79,8 +90,8 @@ class CohereEmbedder(Embedder):
|
|
79
90
|
return format_exception(e)
|
80
91
|
return None
|
81
92
|
|
82
|
-
def
|
83
|
-
return self.embeddings.embed_documents(
|
93
|
+
def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
|
94
|
+
return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
|
84
95
|
|
85
96
|
@property
|
86
97
|
def embedding_dimensions(self) -> int:
|
@@ -100,10 +111,54 @@ class FakeEmbedder(Embedder):
|
|
100
111
|
return format_exception(e)
|
101
112
|
return None
|
102
113
|
|
103
|
-
def
|
104
|
-
return self.embeddings.embed_documents(
|
114
|
+
def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
|
115
|
+
return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
|
105
116
|
|
106
117
|
@property
|
107
118
|
def embedding_dimensions(self) -> int:
|
108
119
|
# use same vector size as for OpenAI embeddings to keep it realistic
|
109
120
|
return OPEN_AI_VECTOR_SIZE
|
121
|
+
|
122
|
+
|
123
|
+
class FromFieldEmbedder(Embedder):
|
124
|
+
def __init__(self, config: FromFieldEmbeddingConfigModel):
|
125
|
+
super().__init__()
|
126
|
+
self.config = config
|
127
|
+
|
128
|
+
def check(self) -> Optional[str]:
|
129
|
+
return None
|
130
|
+
|
131
|
+
def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
|
132
|
+
"""
|
133
|
+
From each chunk, pull the embedding from the field specified in the config.
|
134
|
+
Check that the field exists, is a list of numbers and is the correct size. If not, raise an AirbyteTracedException explaining the problem.
|
135
|
+
"""
|
136
|
+
embeddings = []
|
137
|
+
for chunk in chunks:
|
138
|
+
data = chunk.record.data
|
139
|
+
if self.config.field_name not in data:
|
140
|
+
raise AirbyteTracedException(
|
141
|
+
internal_message="Embedding vector field not found",
|
142
|
+
failure_type=FailureType.config_error,
|
143
|
+
message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.",
|
144
|
+
)
|
145
|
+
field = data[self.config.field_name]
|
146
|
+
if not isinstance(field, list) or not all(isinstance(x, (int, float)) for x in field):
|
147
|
+
raise AirbyteTracedException(
|
148
|
+
internal_message="Embedding vector field not a list of numbers",
|
149
|
+
failure_type=FailureType.config_error,
|
150
|
+
message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
|
151
|
+
)
|
152
|
+
if len(field) != self.config.dimensions:
|
153
|
+
raise AirbyteTracedException(
|
154
|
+
internal_message="Embedding vector field has wrong length",
|
155
|
+
failure_type=FailureType.config_error,
|
156
|
+
message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
|
157
|
+
)
|
158
|
+
embeddings.append(field)
|
159
|
+
|
160
|
+
return embeddings
|
161
|
+
|
162
|
+
@property
|
163
|
+
def embedding_dimensions(self) -> int:
|
164
|
+
return self.config.dimensions
|
@@ -7,7 +7,6 @@ from abc import ABC, abstractmethod
|
|
7
7
|
from typing import Any, Generator, Iterable, List, Optional, Tuple, TypeVar
|
8
8
|
|
9
9
|
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
|
10
|
-
from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
|
11
10
|
from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog
|
12
11
|
|
13
12
|
|
@@ -19,9 +18,8 @@ class Indexer(ABC):
|
|
19
18
|
In a destination connector, implement a custom indexer by extending this class and implementing the abstract methods.
|
20
19
|
"""
|
21
20
|
|
22
|
-
def __init__(self, config: Any
|
21
|
+
def __init__(self, config: Any):
|
23
22
|
self.config = config
|
24
|
-
self.embedder = embedder
|
25
23
|
pass
|
26
24
|
|
27
25
|
def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None:
|
@@ -48,6 +48,6 @@ class BaseIntegrationTest(unittest.TestCase):
|
|
48
48
|
type=Type.RECORD, record=AirbyteRecordMessage(stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0)
|
49
49
|
)
|
50
50
|
|
51
|
-
def setUp(self):
|
51
|
+
def setUp(self) -> None:
|
52
52
|
with open("secrets/config.json", "r") as f:
|
53
53
|
self.config = json.loads(f.read())
|
@@ -8,24 +8,27 @@ from typing import Iterable, List
|
|
8
8
|
from airbyte_cdk.destinations.vector_db_based.batcher import Batcher
|
9
9
|
from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
|
10
10
|
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk, DocumentProcessor
|
11
|
+
from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
|
11
12
|
from airbyte_cdk.destinations.vector_db_based.indexer import Indexer
|
12
13
|
from airbyte_cdk.models import AirbyteMessage, AirbyteRecordMessage, ConfiguredAirbyteCatalog, Type
|
13
14
|
|
14
15
|
|
15
16
|
class Writer:
|
16
17
|
"""
|
17
|
-
The Writer class is orchestrating the document processor, the batcher and the indexer:
|
18
|
+
The Writer class is orchestrating the document processor, the batcher, the embedder and the indexer:
|
18
19
|
* Incoming records are collected using the batcher
|
19
20
|
* The document processor generates documents from all records in the batch
|
20
|
-
* The
|
21
|
+
* The embedder embeds the documents
|
22
|
+
* The indexer indexes the resulting documents and their embeddings in the destination
|
21
23
|
|
22
24
|
The destination connector is responsible to create a writer instance and pass the input messages iterable to the write method.
|
23
25
|
The batch size can be configured by the destination connector to give the freedom of either letting the user configure it or hardcoding it to a sensible value depending on the destination.
|
24
26
|
"""
|
25
27
|
|
26
|
-
def __init__(self, processing_config: ProcessingConfigModel, indexer: Indexer, batch_size: int) -> None:
|
28
|
+
def __init__(self, processing_config: ProcessingConfigModel, indexer: Indexer, embedder: Embedder, batch_size: int) -> None:
|
27
29
|
self.processing_config = processing_config
|
28
30
|
self.indexer = indexer
|
31
|
+
self.embedder = embedder
|
29
32
|
self.batcher = Batcher(batch_size, lambda batch: self._process_batch(batch))
|
30
33
|
|
31
34
|
def _process_batch(self, batch: List[AirbyteRecordMessage]) -> None:
|
@@ -36,6 +39,9 @@ class Writer:
|
|
36
39
|
documents.extend(record_documents)
|
37
40
|
if record_id_to_delete is not None:
|
38
41
|
ids_to_delete.append(record_id_to_delete)
|
42
|
+
embeddings = self.embedder.embed_chunks(documents)
|
43
|
+
for i, document in enumerate(documents):
|
44
|
+
document.embedding = embeddings[i]
|
39
45
|
self.indexer.index(documents, ids_to_delete)
|
40
46
|
|
41
47
|
def write(self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]) -> Iterable[AirbyteMessage]:
|
@@ -14,13 +14,13 @@ airbyte_cdk/destinations/__init__.py,sha256=0Uxmz3iBAyZJdk_bqUVt2pb0UwRTpFjTnFE6
|
|
14
14
|
airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQwQj8MyVm3BI,5420
|
15
15
|
airbyte_cdk/destinations/vector_db_based/__init__.py,sha256=z5Pqxxt3v-JCcJQ6sK4tAz5sg1FB-3wTCd2p85MhFzc,711
|
16
16
|
airbyte_cdk/destinations/vector_db_based/batcher.py,sha256=U2RI0CACZ1WhJIdkC5oPlwZ90OZB40kyFCR5I7StqZw,1160
|
17
|
-
airbyte_cdk/destinations/vector_db_based/config.py,sha256=
|
18
|
-
airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=
|
19
|
-
airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=
|
20
|
-
airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=
|
21
|
-
airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=
|
17
|
+
airbyte_cdk/destinations/vector_db_based/config.py,sha256=xv5-IhPG_eKdRxstYmaFBUrYDECevE64OVRyUBZAJJw,4132
|
18
|
+
airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=KHvCSjt6amwpIYxK42OuT1Vh-RCA5A3vEBfAmowXpZI,6161
|
19
|
+
airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=0YLm5wmqiwCyUD_GWzqetWclzzKsADOfjXu0jMhQS1Y,6837
|
20
|
+
airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=DMic7D7ie4gGQ-yOgGXGYjBsY8H7X5O5Tz_sCr0ajBU,2327
|
21
|
+
airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=8d1Smk4jQRKtDfloXfEq12T-BU8ByyzzSBwAlchsU4A,1807
|
22
22
|
airbyte_cdk/destinations/vector_db_based/utils.py,sha256=ngJ6hc9mmzgAEEBd9nuoRcPPFUKijv2CA6zZYUVRm54,240
|
23
|
-
airbyte_cdk/destinations/vector_db_based/writer.py,sha256=
|
23
|
+
airbyte_cdk/destinations/vector_db_based/writer.py,sha256=zSVizVPupTjdF_dwniIU0RYnTZ9TMkizOK48tDNPxxk,3110
|
24
24
|
airbyte_cdk/models/__init__.py,sha256=rDARocDgxf4_qI66Bm6dHTBoecbWguTClGVBmOBiI2o,1674
|
25
25
|
airbyte_cdk/models/airbyte_protocol.py,sha256=DoJvnmGM3xMAZFTwA6_RGMiKSFqfE3ib_Ru0KJ65Ag4,100
|
26
26
|
airbyte_cdk/models/well_known_types.py,sha256=KKfNbow2gdLoC1Z4hcXy_JR8m_acsB2ol7gQuEgjobw,117
|
@@ -365,8 +365,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
|
|
365
365
|
unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
|
366
366
|
unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
|
367
367
|
unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
|
368
|
-
airbyte_cdk-0.51.
|
369
|
-
airbyte_cdk-0.51.
|
370
|
-
airbyte_cdk-0.51.
|
371
|
-
airbyte_cdk-0.51.
|
372
|
-
airbyte_cdk-0.51.
|
368
|
+
airbyte_cdk-0.51.12.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
369
|
+
airbyte_cdk-0.51.12.dist-info/METADATA,sha256=kP39_c0A5hJ-e8yU-oZ-zAbknbhsFKaz7I11AoLyh5o,9895
|
370
|
+
airbyte_cdk-0.51.12.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
371
|
+
airbyte_cdk-0.51.12.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
|
372
|
+
airbyte_cdk-0.51.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|