airbyte-cdk 0.51.11__py3-none-any.whl → 0.51.12__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -59,6 +59,22 @@ class FakeEmbeddingConfigModel(BaseModel):
59
59
  }
60
60
 
61
61
 
62
+ class FromFieldEmbeddingConfigModel(BaseModel):
63
+ mode: Literal["from_field"] = Field("from_field", const=True)
64
+ field_name: str = Field(
65
+ ..., title="Field name", description="Name of the field in the record that contains the embedding", examples=["embedding", "vector"]
66
+ )
67
+ dimensions: int = Field(
68
+ ..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
69
+ )
70
+
71
+ class Config:
72
+ title = "From Field"
73
+ schema_extra = {
74
+ "description": "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
75
+ }
76
+
77
+
62
78
  class CohereEmbeddingConfigModel(BaseModel):
63
79
  mode: Literal["cohere"] = Field("cohere", const=True)
64
80
  cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True)
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
9
9
  import dpath.util
10
10
  from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
11
11
  from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode
12
+ from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
12
13
  from langchain.document_loaders.base import Document
13
14
  from langchain.text_splitter import RecursiveCharacterTextSplitter
14
15
  from langchain.utils import stringify_dict
@@ -21,8 +22,8 @@ METADATA_RECORD_ID_FIELD = "_ab_record_id"
21
22
  class Chunk:
22
23
  page_content: str
23
24
  metadata: Dict[str, Any]
24
- stream: str
25
- namespace: Optional[str] = None
25
+ record: AirbyteRecordMessage
26
+ embedding: Optional[List[float]] = None
26
27
 
27
28
 
28
29
  class DocumentProcessor:
@@ -66,11 +67,14 @@ class DocumentProcessor:
66
67
  """
67
68
  doc = self._generate_document(record)
68
69
  if doc is None:
69
- raise ValueError(f"Record {str(record.data)[:250]}... does not contain any text fields.")
70
- chunks = [
71
- Chunk(
72
- page_content=chunk_document.page_content, metadata=chunk_document.metadata, stream=record.stream, namespace=record.namespace
70
+ text_fields = ", ".join(self.text_fields) if self.text_fields else "all fields"
71
+ raise AirbyteTracedException(
72
+ internal_message="No text fields found in record",
73
+ message=f"Record {str(record.data)[:250]}... does not contain any of the configured text fields: {text_fields}. Please check your processing configuration, there has to be at least one text field set in each record.",
74
+ failure_type=FailureType.config_error,
73
75
  )
76
+ chunks = [
77
+ Chunk(page_content=chunk_document.page_content, metadata=chunk_document.metadata, record=record)
74
78
  for chunk_document in self._split_document(doc)
75
79
  ]
76
80
  id_to_delete = doc.metadata[METADATA_RECORD_ID_FIELD] if METADATA_RECORD_ID_FIELD in doc.metadata else None
@@ -5,8 +5,15 @@
5
5
  from abc import ABC, abstractmethod
6
6
  from typing import List, Optional
7
7
 
8
- from airbyte_cdk.destinations.vector_db_based.config import CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, OpenAIEmbeddingConfigModel
8
+ from airbyte_cdk.destinations.vector_db_based.config import (
9
+ CohereEmbeddingConfigModel,
10
+ FakeEmbeddingConfigModel,
11
+ FromFieldEmbeddingConfigModel,
12
+ OpenAIEmbeddingConfigModel,
13
+ )
14
+ from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
9
15
  from airbyte_cdk.destinations.vector_db_based.utils import format_exception
16
+ from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
10
17
  from langchain.embeddings.cohere import CohereEmbeddings
11
18
  from langchain.embeddings.fake import FakeEmbeddings
12
19
  from langchain.embeddings.openai import OpenAIEmbeddings
@@ -17,7 +24,7 @@ class Embedder(ABC):
17
24
  Embedder is an abstract class that defines the interface for embedding text.
18
25
 
19
26
  The Indexer class uses the Embedder class to internally embed text - each indexer is responsible to pass the text of all documents to the embedder and store the resulting embeddings in the destination.
20
- The destination connector is responsible to create an embedder instance and pass it to the indexer.
27
+ The destination connector is responsible to create an embedder instance and pass it to the writer.
21
28
  The CDK defines basic embedders that should be supported in each destination. It is possible to implement custom embedders for special destinations if needed.
22
29
  """
23
30
 
@@ -29,7 +36,11 @@ class Embedder(ABC):
29
36
  pass
30
37
 
31
38
  @abstractmethod
32
- def embed_texts(self, texts: List[str]) -> List[List[float]]:
39
+ def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
40
+ """
41
+ Embed the text of each chunk and return the resulting embedding vectors.
42
+ If a chunk cannot be embedded or is configured to not be embedded, return None for that chunk.
43
+ """
33
44
  pass
34
45
 
35
46
  @property
@@ -54,8 +65,8 @@ class OpenAIEmbedder(Embedder):
54
65
  return format_exception(e)
55
66
  return None
56
67
 
57
- def embed_texts(self, texts: List[str]) -> List[List[float]]:
58
- return self.embeddings.embed_documents(texts)
68
+ def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
69
+ return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
59
70
 
60
71
  @property
61
72
  def embedding_dimensions(self) -> int:
@@ -79,8 +90,8 @@ class CohereEmbedder(Embedder):
79
90
  return format_exception(e)
80
91
  return None
81
92
 
82
- def embed_texts(self, texts: List[str]) -> List[List[float]]:
83
- return self.embeddings.embed_documents(texts)
93
+ def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
94
+ return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
84
95
 
85
96
  @property
86
97
  def embedding_dimensions(self) -> int:
@@ -100,10 +111,54 @@ class FakeEmbedder(Embedder):
100
111
  return format_exception(e)
101
112
  return None
102
113
 
103
- def embed_texts(self, texts: List[str]) -> List[List[float]]:
104
- return self.embeddings.embed_documents(texts)
114
+ def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
115
+ return self.embeddings.embed_documents([chunk.page_content for chunk in chunks])
105
116
 
106
117
  @property
107
118
  def embedding_dimensions(self) -> int:
108
119
  # use same vector size as for OpenAI embeddings to keep it realistic
109
120
  return OPEN_AI_VECTOR_SIZE
121
+
122
+
123
+ class FromFieldEmbedder(Embedder):
124
+ def __init__(self, config: FromFieldEmbeddingConfigModel):
125
+ super().__init__()
126
+ self.config = config
127
+
128
+ def check(self) -> Optional[str]:
129
+ return None
130
+
131
+ def embed_chunks(self, chunks: List[Chunk]) -> List[List[float]]:
132
+ """
133
+ From each chunk, pull the embedding from the field specified in the config.
134
+ Check that the field exists, is a list of numbers and is the correct size. If not, raise an AirbyteTracedException explaining the problem.
135
+ """
136
+ embeddings = []
137
+ for chunk in chunks:
138
+ data = chunk.record.data
139
+ if self.config.field_name not in data:
140
+ raise AirbyteTracedException(
141
+ internal_message="Embedding vector field not found",
142
+ failure_type=FailureType.config_error,
143
+ message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.",
144
+ )
145
+ field = data[self.config.field_name]
146
+ if not isinstance(field, list) or not all(isinstance(x, (int, float)) for x in field):
147
+ raise AirbyteTracedException(
148
+ internal_message="Embedding vector field not a list of numbers",
149
+ failure_type=FailureType.config_error,
150
+ message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
151
+ )
152
+ if len(field) != self.config.dimensions:
153
+ raise AirbyteTracedException(
154
+ internal_message="Embedding vector field has wrong length",
155
+ failure_type=FailureType.config_error,
156
+ message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
157
+ )
158
+ embeddings.append(field)
159
+
160
+ return embeddings
161
+
162
+ @property
163
+ def embedding_dimensions(self) -> int:
164
+ return self.config.dimensions
@@ -7,7 +7,6 @@ from abc import ABC, abstractmethod
7
7
  from typing import Any, Generator, Iterable, List, Optional, Tuple, TypeVar
8
8
 
9
9
  from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
10
- from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
11
10
  from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog
12
11
 
13
12
 
@@ -19,9 +18,8 @@ class Indexer(ABC):
19
18
  In a destination connector, implement a custom indexer by extending this class and implementing the abstract methods.
20
19
  """
21
20
 
22
- def __init__(self, config: Any, embedder: Embedder):
21
+ def __init__(self, config: Any):
23
22
  self.config = config
24
- self.embedder = embedder
25
23
  pass
26
24
 
27
25
  def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None:
@@ -48,6 +48,6 @@ class BaseIntegrationTest(unittest.TestCase):
48
48
  type=Type.RECORD, record=AirbyteRecordMessage(stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0)
49
49
  )
50
50
 
51
- def setUp(self):
51
+ def setUp(self) -> None:
52
52
  with open("secrets/config.json", "r") as f:
53
53
  self.config = json.loads(f.read())
@@ -8,24 +8,27 @@ from typing import Iterable, List
8
8
  from airbyte_cdk.destinations.vector_db_based.batcher import Batcher
9
9
  from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
10
10
  from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk, DocumentProcessor
11
+ from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
11
12
  from airbyte_cdk.destinations.vector_db_based.indexer import Indexer
12
13
  from airbyte_cdk.models import AirbyteMessage, AirbyteRecordMessage, ConfiguredAirbyteCatalog, Type
13
14
 
14
15
 
15
16
  class Writer:
16
17
  """
17
- The Writer class is orchestrating the document processor, the batcher and the indexer:
18
+ The Writer class is orchestrating the document processor, the batcher, the embedder and the indexer:
18
19
  * Incoming records are collected using the batcher
19
20
  * The document processor generates documents from all records in the batch
20
- * The indexer indexes the resulting documents in the destination
21
+ * The embedder embeds the documents
22
+ * The indexer indexes the resulting documents and their embeddings in the destination
21
23
 
22
24
  The destination connector is responsible to create a writer instance and pass the input messages iterable to the write method.
23
25
  The batch size can be configured by the destination connector to give the freedom of either letting the user configure it or hardcoding it to a sensible value depending on the destination.
24
26
  """
25
27
 
26
- def __init__(self, processing_config: ProcessingConfigModel, indexer: Indexer, batch_size: int) -> None:
28
+ def __init__(self, processing_config: ProcessingConfigModel, indexer: Indexer, embedder: Embedder, batch_size: int) -> None:
27
29
  self.processing_config = processing_config
28
30
  self.indexer = indexer
31
+ self.embedder = embedder
29
32
  self.batcher = Batcher(batch_size, lambda batch: self._process_batch(batch))
30
33
 
31
34
  def _process_batch(self, batch: List[AirbyteRecordMessage]) -> None:
@@ -36,6 +39,9 @@ class Writer:
36
39
  documents.extend(record_documents)
37
40
  if record_id_to_delete is not None:
38
41
  ids_to_delete.append(record_id_to_delete)
42
+ embeddings = self.embedder.embed_chunks(documents)
43
+ for i, document in enumerate(documents):
44
+ document.embedding = embeddings[i]
39
45
  self.indexer.index(documents, ids_to_delete)
40
46
 
41
47
  def write(self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]) -> Iterable[AirbyteMessage]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-cdk
3
- Version: 0.51.11
3
+ Version: 0.51.12
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  Home-page: https://github.com/airbytehq/airbyte
6
6
  Author: Airbyte
@@ -14,13 +14,13 @@ airbyte_cdk/destinations/__init__.py,sha256=0Uxmz3iBAyZJdk_bqUVt2pb0UwRTpFjTnFE6
14
14
  airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQwQj8MyVm3BI,5420
15
15
  airbyte_cdk/destinations/vector_db_based/__init__.py,sha256=z5Pqxxt3v-JCcJQ6sK4tAz5sg1FB-3wTCd2p85MhFzc,711
16
16
  airbyte_cdk/destinations/vector_db_based/batcher.py,sha256=U2RI0CACZ1WhJIdkC5oPlwZ90OZB40kyFCR5I7StqZw,1160
17
- airbyte_cdk/destinations/vector_db_based/config.py,sha256=JrK2yuxWGaBa2PBP8WUX8WMG2ah6ah-Z75nWCeoGm_A,3407
18
- airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=D4L2NiMLv-Wy2FuYjluk-tuHljmSWMYGLHPNwoMi_0c,5716
19
- airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=mB-VOVB3-DCKEtKCNhcuLH9OFFcyuP3yNCUf37OtV1M,3640
20
- airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=WFSPGJsvCRSqdZkgeM2RCSD8VPgHOnJmKbXSNK5XZos,2451
21
- airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=Ldf-nJYCCzemP0bkjMQiRiApNh252cBs8PHKlFabG6o,1799
17
+ airbyte_cdk/destinations/vector_db_based/config.py,sha256=xv5-IhPG_eKdRxstYmaFBUrYDECevE64OVRyUBZAJJw,4132
18
+ airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=KHvCSjt6amwpIYxK42OuT1Vh-RCA5A3vEBfAmowXpZI,6161
19
+ airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=0YLm5wmqiwCyUD_GWzqetWclzzKsADOfjXu0jMhQS1Y,6837
20
+ airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=DMic7D7ie4gGQ-yOgGXGYjBsY8H7X5O5Tz_sCr0ajBU,2327
21
+ airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=8d1Smk4jQRKtDfloXfEq12T-BU8ByyzzSBwAlchsU4A,1807
22
22
  airbyte_cdk/destinations/vector_db_based/utils.py,sha256=ngJ6hc9mmzgAEEBd9nuoRcPPFUKijv2CA6zZYUVRm54,240
23
- airbyte_cdk/destinations/vector_db_based/writer.py,sha256=uf3QLEKwaJsuGuhbfaJ4qbvr8EXc4NXr4J3JERP5dtI,2756
23
+ airbyte_cdk/destinations/vector_db_based/writer.py,sha256=zSVizVPupTjdF_dwniIU0RYnTZ9TMkizOK48tDNPxxk,3110
24
24
  airbyte_cdk/models/__init__.py,sha256=rDARocDgxf4_qI66Bm6dHTBoecbWguTClGVBmOBiI2o,1674
25
25
  airbyte_cdk/models/airbyte_protocol.py,sha256=DoJvnmGM3xMAZFTwA6_RGMiKSFqfE3ib_Ru0KJ65Ag4,100
26
26
  airbyte_cdk/models/well_known_types.py,sha256=KKfNbow2gdLoC1Z4hcXy_JR8m_acsB2ol7gQuEgjobw,117
@@ -365,8 +365,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
365
365
  unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
366
366
  unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
367
367
  unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
368
- airbyte_cdk-0.51.11.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
369
- airbyte_cdk-0.51.11.dist-info/METADATA,sha256=2yDh-4NTXY4ALKd3svXb7IoBxTefflum6uw2szW9cjw,9895
370
- airbyte_cdk-0.51.11.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
371
- airbyte_cdk-0.51.11.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
372
- airbyte_cdk-0.51.11.dist-info/RECORD,,
368
+ airbyte_cdk-0.51.12.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
369
+ airbyte_cdk-0.51.12.dist-info/METADATA,sha256=kP39_c0A5hJ-e8yU-oZ-zAbknbhsFKaz7I11AoLyh5o,9895
370
+ airbyte_cdk-0.51.12.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
371
+ airbyte_cdk-0.51.12.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
372
+ airbyte_cdk-0.51.12.dist-info/RECORD,,