ragbits-document-search 1.4.0.dev202601310254__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. ragbits/document_search/__init__.py +3 -0
  2. ragbits/document_search/_main.py +273 -0
  3. ragbits/document_search/cli.py +109 -0
  4. ragbits/document_search/documents/__init__.py +0 -0
  5. ragbits/document_search/documents/document.py +203 -0
  6. ragbits/document_search/documents/element.py +208 -0
  7. ragbits/document_search/ingestion/__init__.py +0 -0
  8. ragbits/document_search/ingestion/enrichers/__init__.py +5 -0
  9. ragbits/document_search/ingestion/enrichers/base.py +64 -0
  10. ragbits/document_search/ingestion/enrichers/exceptions.py +32 -0
  11. ragbits/document_search/ingestion/enrichers/image.py +107 -0
  12. ragbits/document_search/ingestion/enrichers/router.py +86 -0
  13. ragbits/document_search/ingestion/parsers/__init__.py +9 -0
  14. ragbits/document_search/ingestion/parsers/base.py +97 -0
  15. ragbits/document_search/ingestion/parsers/docling.py +178 -0
  16. ragbits/document_search/ingestion/parsers/exceptions.py +32 -0
  17. ragbits/document_search/ingestion/parsers/pptx/__init__.py +28 -0
  18. ragbits/document_search/ingestion/parsers/pptx/callbacks.py +32 -0
  19. ragbits/document_search/ingestion/parsers/pptx/exceptions.py +52 -0
  20. ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py +84 -0
  21. ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py +78 -0
  22. ragbits/document_search/ingestion/parsers/pptx/parser.py +85 -0
  23. ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py +75 -0
  24. ragbits/document_search/ingestion/parsers/router.py +90 -0
  25. ragbits/document_search/ingestion/parsers/unstructured.py +248 -0
  26. ragbits/document_search/ingestion/strategies/__init__.py +6 -0
  27. ragbits/document_search/ingestion/strategies/base.py +290 -0
  28. ragbits/document_search/ingestion/strategies/batched.py +261 -0
  29. ragbits/document_search/ingestion/strategies/ray.py +138 -0
  30. ragbits/document_search/ingestion/strategies/sequential.py +23 -0
  31. ragbits/document_search/py.typed +0 -0
  32. ragbits/document_search/retrieval/__init__.py +0 -0
  33. ragbits/document_search/retrieval/rephrasers/__init__.py +18 -0
  34. ragbits/document_search/retrieval/rephrasers/base.py +39 -0
  35. ragbits/document_search/retrieval/rephrasers/llm.py +141 -0
  36. ragbits/document_search/retrieval/rephrasers/noop.py +26 -0
  37. ragbits/document_search/retrieval/rerankers/__init__.py +4 -0
  38. ragbits/document_search/retrieval/rerankers/answerai.py +82 -0
  39. ragbits/document_search/retrieval/rerankers/base.py +56 -0
  40. ragbits/document_search/retrieval/rerankers/litellm.py +85 -0
  41. ragbits/document_search/retrieval/rerankers/llm.py +177 -0
  42. ragbits/document_search/retrieval/rerankers/noop.py +34 -0
  43. ragbits/document_search/retrieval/rerankers/rrf.py +73 -0
  44. ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA +85 -0
  45. ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD +46 -0
  46. ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL +4 -0
@@ -0,0 +1,90 @@
+ from collections.abc import Mapping
+ from typing import ClassVar
+
+ from typing_extensions import Self
+
+ from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig
+ from ragbits.document_search.documents.document import DocumentType
+ from ragbits.document_search.ingestion.parsers.base import DocumentParser
+ from ragbits.document_search.ingestion.parsers.exceptions import ParserNotFoundError
+
+
+ class DocumentParserRouter(WithConstructionConfig):
+     """
+     The class responsible for routing the document to the correct parser based on the document type.
+     """
+
+     configuration_key: ClassVar[str] = "parser_router"
+
+     _parsers: Mapping[DocumentType, DocumentParser]
+
+     def __init__(self, parsers: Mapping[DocumentType, DocumentParser] | None = None) -> None:
+         """
+         Initialize the DocumentParserRouter instance.
+
+         Args:
+             parsers: The mapping of document types to their parsers, used to override the default parsers.
+         """
+         self._parsers = {**self._get_default_parsers(), **parsers} if parsers else self._get_default_parsers()
+
+     @classmethod
+     def from_config(cls, config: dict[str, ObjectConstructionConfig]) -> Self:
+         """
+         Initialize the class with the provided configuration.
+
+         Args:
+             config: A dictionary containing configuration details for the class.
+
+         Returns:
+             The DocumentParserRouter.
+
+         Raises:
+             InvalidConfigError: If any of the provided parsers cannot be initialized.
+         """
+         parsers = {
+             DocumentType(document_type): DocumentParser.subclass_from_config(parser_config)
+             for document_type, parser_config in config.items()
+         }
+         return super().from_config({"parsers": parsers})
+
+     def get(self, document_type: DocumentType) -> DocumentParser:
+         """
+         Get the parser for the document.
+
+         Args:
+             document_type: The document type.
+
+         Returns:
+             The parser for processing the document.
+
+         Raises:
+             ParserNotFoundError: If no parser is found for the document type.
+         """
+         parser = self._parsers.get(document_type)
+
+         if isinstance(parser, DocumentParser):
+             return parser
+
+         raise ParserNotFoundError(document_type)
+
+     @staticmethod
+     def _get_default_parsers() -> dict[DocumentType, DocumentParser]:
+         """
+         Get the default parsers.
+         """
+         from ragbits.document_search.ingestion.parsers.docling import DoclingDocumentParser
+         from ragbits.document_search.ingestion.parsers.pptx.parser import PptxDocumentParser
+
+         _default_parser = DoclingDocumentParser()
+
+         return {
+             DocumentType.TXT: _default_parser,
+             DocumentType.MD: _default_parser,
+             DocumentType.PDF: _default_parser,
+             DocumentType.DOCX: _default_parser,
+             DocumentType.PPTX: PptxDocumentParser(),
+             DocumentType.XLSX: _default_parser,
+             DocumentType.HTML: _default_parser,
+             DocumentType.JPG: _default_parser,
+             DocumentType.PNG: _default_parser,
+         }
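
For orientation, here is a minimal usage sketch of the router added above (not part of the package): overriding the default PDF parser while keeping the remaining defaults. UnstructuredDocumentParser refers to the unstructured.py module later in this diff.

    from ragbits.document_search.documents.document import DocumentType
    from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
    from ragbits.document_search.ingestion.parsers.unstructured import UnstructuredDocumentParser

    # Only PDF is overridden; every other mapped type keeps the Docling default.
    router = DocumentParserRouter(parsers={DocumentType.PDF: UnstructuredDocumentParser()})

    pdf_parser = router.get(DocumentType.PDF)  # the override
    txt_parser = router.get(DocumentType.TXT)  # the default DoclingDocumentParser
    # Requesting a document type with no mapping raises ParserNotFoundError.
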
@@ -0,0 +1,248 @@
+ import base64
+ import inspect
+ import os
+ from io import BytesIO
+
+ from PIL import Image
+ from typing_extensions import Self
+
+ try:
+     from unstructured import utils
+ finally:
+     # Unstructured makes a very slow call to Scarf analytics (including checking nvidia-smi),
+     # which adds a couple of seconds to the import time.
+     # This is a hack to disable it.
+     utils.scarf_analytics = lambda *args: True
+
+ from unstructured.chunking.basic import chunk_elements
+ from unstructured.documents.elements import Element as UnstructuredElement
+ from unstructured.documents.elements import ElementType
+ from unstructured.partition.auto import partition
+ from unstructured.staging.base import elements_from_dicts
+ from unstructured_client import UnstructuredClient
+ from unstructured_client.models.operations import PartitionRequestTypedDict
+ from unstructured_client.models.shared import FilesTypedDict, PartitionParametersTypedDict, Strategy
+
+ from ragbits.core.audit.traces import traceable
+ from ragbits.document_search.documents.document import Document, DocumentType
+ from ragbits.document_search.documents.element import Element, ElementLocation, ImageElement, TextElement
+ from ragbits.document_search.ingestion.parsers.base import DocumentParser
+
+ UNSTRUCTURED_API_KEY_ENV = "UNSTRUCTURED_API_KEY"
+ UNSTRUCTURED_SERVER_URL_ENV = "UNSTRUCTURED_SERVER_URL"
+
+
+ class UnstructuredDocumentParser(DocumentParser):
+     """
+     Parser that uses the Unstructured API or local SDK to process the documents.
+     """
+
+     supported_document_types = {
+         DocumentType.TXT,
+         DocumentType.MD,
+         DocumentType.PDF,
+         DocumentType.DOCX,
+         DocumentType.DOC,
+         DocumentType.PPTX,
+         DocumentType.PPT,
+         DocumentType.XLSX,
+         DocumentType.XLS,
+         DocumentType.CSV,
+         DocumentType.HTML,
+         DocumentType.EPUB,
+         DocumentType.ORG,
+         DocumentType.ODT,
+         DocumentType.RST,
+         DocumentType.RTF,
+         DocumentType.TSV,
+         DocumentType.JSON,
+         DocumentType.XML,
+         DocumentType.JPG,
+         DocumentType.PNG,
+     }
+
+     def __init__(
+         self,
+         partition_kwargs: dict | None = None,
+         chunking_kwargs: dict | None = None,
+         api_key: str | None = None,
+         api_server: str | None = None,
+         use_api: bool = False,
+         ignore_images: bool = False,
+     ) -> None:
+         """
+         Initialize the UnstructuredDocumentParser instance.
+
+         Args:
+             partition_kwargs: The additional arguments for the partitioning. Refer to the Unstructured API
+                 documentation for the available options:
+                 https://docs.unstructured.io/api-reference/api-services/api-parameters
+             chunking_kwargs: The additional arguments for the chunking.
+             api_key: The API key to use for the Unstructured API. If not specified, the UNSTRUCTURED_API_KEY
+                 environment variable will be used.
+             api_server: The API server URL to use for the Unstructured API. If not specified, the
+                 UNSTRUCTURED_SERVER_URL environment variable will be used.
+             use_api: Whether to use the Unstructured API; otherwise the local version of the Unstructured
+                 library is used.
+             ignore_images: If True, images will be skipped.
+         """
+         self.partition_kwargs = partition_kwargs or {}
+         self.chunking_kwargs = chunking_kwargs or {}
+         self.api_key = api_key or os.getenv(UNSTRUCTURED_API_KEY_ENV)
+         self.api_server = api_server or os.getenv(UNSTRUCTURED_SERVER_URL_ENV)
+         self.use_api = use_api
+         self.ignore_images = ignore_images
+         self._client = UnstructuredClient(api_key_auth=self.api_key, server_url=self.api_server)
+
+     def __reduce__(self) -> tuple[type[Self], tuple]:
+         """
+         Enables the UnstructuredDocumentParser to be pickled and unpickled.
+
+         Returns:
+             The tuple of class and its arguments that allows object reconstruction.
+         """
+         return self.__class__, tuple(
+             self.__getattribute__(param_name)
+             for param_name in list(inspect.signature(self.__class__.__init__).parameters)[1:]
+         )
+
+     @traceable
+     async def parse(self, document: Document) -> list[Element]:
+         """
+         Parse the document using the Unstructured API.
+
+         Args:
+             document: The document to parse.
+
+         Returns:
+             The list of elements extracted from the document.
+
+         Raises:
+             ParserDocumentNotSupportedError: If the document type is not supported by the parser.
+         """
+         self.validate_document_type(document.metadata.document_type)
+         elements = await self._partition(document)
+         return self._chunk(elements, document)
+
+     async def _partition(self, document: Document) -> list[UnstructuredElement]:
+         """
+         Partition the document.
+
+         Args:
+             document: The document to parse.
+
+         Returns:
+             The list of extracted elements.
+         """
+         if self.use_api:
+             request = PartitionRequestTypedDict(
+                 partition_parameters=PartitionParametersTypedDict(
+                     files=FilesTypedDict(
+                         content=document.local_path.read_bytes(),
+                         file_name=document.local_path.name,
+                     ),
+                     coordinates=True,
+                     strategy=Strategy.HI_RES,
+                     languages=["eng"],
+                     extract_image_block_types=["Image", "Table"],
+                     split_pdf_allow_failed=True,
+                     split_pdf_concurrency_level=15,
+                     split_pdf_page=True,
+                     include_orig_elements=True,
+                 ),
+             )
+             request["partition_parameters"].update(**self.partition_kwargs)  # type: ignore
+             response = await self._client.general.partition_async(request=request)
+             return elements_from_dicts(response.elements) if response.elements else []
+
+         return partition(
+             filename=str(document.local_path),
+             metadata_filename=document.local_path.name,
+             extract_image_block_types=["Image", "Table"],
+             extract_image_block_to_payload=True,
+             include_orig_elements=True,
+             **self.partition_kwargs,
+         )
+
+     def _chunk(self, elements: list[UnstructuredElement], document: Document) -> list[Element]:
+         """
+         Chunk the list of elements.
+
+         Args:
+             elements: The list of unstructured elements.
+             document: The document to parse.
+
+         Returns:
+             The list of chunked elements.
+         """
+         nonimage_elements = [element for element in elements if element.category != ElementType.IMAGE]
+
+         text_elements: list[Element] = [
+             TextElement(
+                 document_meta=document.metadata,
+                 location=self._extract_element_location(element),
+                 content=element.text,
+             )
+             for element in chunk_elements(nonimage_elements, **self.chunking_kwargs)
+         ]
+
+         if self.ignore_images:
+             return text_elements
+
+         return text_elements + [
+             ImageElement(
+                 document_meta=document.metadata,
+                 location=self._extract_element_location(element),
+                 image_bytes=self._extract_image_element_bytes(element, document),
+                 ocr_extracted_text=element.text,
+             )
+             for element in elements
+             if element.category == ElementType.IMAGE
+         ]
+
+     @staticmethod
+     def _extract_element_location(element: UnstructuredElement) -> ElementLocation:
+         """
+         Convert an unstructured element to an element location.
+
+         Args:
+             element: The element from unstructured.
+
+         Returns:
+             The element location.
+         """
+         metadata = element.metadata.to_dict()
+         return ElementLocation(
+             page_number=metadata.get("page_number"),
+             coordinates=metadata.get("coordinates"),
+         )
+
+     @staticmethod
+     def _extract_image_element_bytes(element: UnstructuredElement, document: Document) -> bytes:
+         """
+         Extract image data using alternative methods when element.metadata.image_base64 is empty.
+
+         This handles cases where Unstructured doesn't properly extract image data,
+         requiring additional processing.
+
+         Args:
+             element: The Unstructured image element.
+             document: The document to parse.
+
+         Returns:
+             The raw image data.
+         """
+         if element.metadata.image_base64:
+             return base64.b64decode(element.metadata.image_base64)
+
+         if element.metadata.coordinates and element.metadata.coordinates.points:
+             buffered = BytesIO()
+             Image.open(document.local_path).convert("RGB").crop(
+                 (
+                     min(element.metadata.coordinates.points[0][0], element.metadata.coordinates.points[1][0]),
+                     min(element.metadata.coordinates.points[0][1], element.metadata.coordinates.points[3][1]),
+                     max(element.metadata.coordinates.points[2][0], element.metadata.coordinates.points[3][0]),
+                     max(element.metadata.coordinates.points[1][1], element.metadata.coordinates.points[2][1]),
+                 )
+             ).save(buffered, format="JPEG")
+             return buffered.getvalue()
+
+         return b""
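
A hedged usage sketch for the parser above (not from the package): running the local Unstructured pipeline on a file. DocumentMeta.from_local_path is assumed to exist in ragbits.document_search.documents.document; only parse and the constructor arguments are taken from this diff.

    import asyncio
    from pathlib import Path

    from ragbits.document_search.documents.document import DocumentMeta
    from ragbits.document_search.ingestion.parsers.unstructured import UnstructuredDocumentParser


    async def main() -> None:
        # use_api=False selects the local partition() path; ignore_images skips ImageElements.
        parser = UnstructuredDocumentParser(use_api=False, ignore_images=True)
        document = await DocumentMeta.from_local_path(Path("report.pdf")).fetch()  # assumed helper
        elements = await parser.parse(document)
        for element in elements:
            print(type(element).__name__, element.location)


    asyncio.run(main())
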
@@ -0,0 +1,6 @@
+ from ragbits.document_search.ingestion.strategies.base import IngestStrategy
+ from ragbits.document_search.ingestion.strategies.batched import BatchedIngestStrategy
+ from ragbits.document_search.ingestion.strategies.ray import RayDistributedIngestStrategy
+ from ragbits.document_search.ingestion.strategies.sequential import SequentialIngestStrategy
+
+ __all__ = ["BatchedIngestStrategy", "IngestStrategy", "RayDistributedIngestStrategy", "SequentialIngestStrategy"]
@@ -0,0 +1,290 @@
+ import asyncio
+ import logging
+ import random
+ import traceback
+ from abc import ABC, abstractmethod
+ from collections import defaultdict
+ from collections.abc import Awaitable, Callable, Iterable
+ from dataclasses import dataclass, field
+ from types import ModuleType
+ from typing import ClassVar, ParamSpec, TypeVar
+
+ from ragbits.core.sources.base import Source
+ from ragbits.core.utils.config_handling import WithConstructionConfig
+ from ragbits.core.vector_stores.base import VectorStore
+ from ragbits.document_search.documents.document import Document, DocumentMeta
+ from ragbits.document_search.documents.element import Element
+ from ragbits.document_search.ingestion import strategies
+ from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
+ from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
+
+ logger = logging.getLogger(__name__)
+
+ _CallP = ParamSpec("_CallP")
+ _CallReturnT = TypeVar("_CallReturnT")
+
+
+ @dataclass
+ class IngestError:
+     """
+     Represents an error that occurred during the document ingest execution.
+     """
+
+     type: type[Exception]
+     message: str
+     stacktrace: str
+
+     @classmethod
+     def from_exception(cls, exc: Exception) -> "IngestError":
+         """
+         Create an IngestError from an exception.
+
+         Args:
+             exc: The exception to create the IngestError from.
+
+         Returns:
+             The IngestError instance.
+         """
+         stacktrace = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
+         return cls(type=type(exc), message=str(exc), stacktrace=stacktrace)
+
+
+ @dataclass
+ class IngestDocumentResult:
+     """
+     Represents the result of the document ingest execution.
+     """
+
+     document_uri: str
+     num_elements: int = 0
+     error: IngestError | None = None
+
+
+ @dataclass
+ class IngestExecutionResult:
+     """
+     Represents the result of the documents ingest execution.
+     """
+
+     successful: list[IngestDocumentResult] = field(default_factory=list)
+     failed: list[IngestDocumentResult] = field(default_factory=list)
+
+
+ class IngestExecutionError(Exception):
+     """
+     Represents an error that occurred during the documents ingest execution.
+     """
+
+     def __init__(self, results: list[IngestDocumentResult]) -> None:
+         self.results = results
+
+
+ class IngestStrategy(WithConstructionConfig, ABC):
+     """
+     Base class for ingest strategies, responsible for orchestrating the tasks required to index the document.
+     """
+
+     default_module: ClassVar[ModuleType | None] = strategies
+     configuration_key: ClassVar[str] = "ingest_strategy"
+
+     def __init__(self, num_retries: int = 3, backoff_multiplier: int = 1, backoff_max: int = 60) -> None:
+         """
+         Initialize the IngestStrategy instance.
+
+         Args:
+             num_retries: The number of retries per document ingest task error.
+             backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
+             backoff_max: The maximum allowed delay (in seconds) between retries.
+         """
+         self.num_retries = num_retries
+         self.backoff_multiplier = backoff_multiplier
+         self.backoff_max = backoff_max
+
+     @abstractmethod
+     async def __call__(
+         self,
+         documents: Iterable[DocumentMeta | Document | Source],
+         vector_store: VectorStore,
+         parser_router: DocumentParserRouter,
+         enricher_router: ElementEnricherRouter,
+     ) -> IngestExecutionResult:
+         """
+         Ingest documents.
+
+         Args:
+             documents: The documents to ingest.
+             vector_store: The vector store to store document chunks.
+             parser_router: The document parser router to use.
+             enricher_router: The intermediate element enricher router to use.
+
+         Returns:
+             The ingest execution result.
+         """
+
+     async def _call_with_error_handling(
+         self,
+         executable: Callable[_CallP, Awaitable[_CallReturnT]],
+         *executable_args: _CallP.args,
+         **executable_kwargs: _CallP.kwargs,
+     ) -> _CallReturnT:
+         """
+         Call the executable with standardized error handling.
+         If an error occurs, the executable is retried `num_retries` times using randomized exponential backoff.
+
+         Args:
+             executable: The callable function to execute.
+             executable_args: Positional arguments to pass to the executable.
+             executable_kwargs: Keyword arguments to pass to the executable.
+
+         Returns:
+             The result of the executable if successful.
+
+         Raises:
+             Exception: The last encountered exception after all retries are exhausted.
+         """
+         for i in range(max(0, self.num_retries) + 1):
+             try:
+                 return await executable(*executable_args, **executable_kwargs)
+             except Exception as exc:
+                 if i == self.num_retries:
+                     raise exc
+
+                 # Full jitter: sleep for a uniformly random delay capped by the exponential backoff.
+                 delay = min(2**i * self.backoff_multiplier, self.backoff_max)
+                 delay = random.uniform(0, delay)  # noqa: S311
+                 await asyncio.sleep(delay)
+
+         raise RuntimeError("Unreachable code reached")  # mypy quirk
+
+     @staticmethod
+     async def _parse_document(
+         document: DocumentMeta | Document | Source,
+         parser_router: DocumentParserRouter,
+     ) -> list[Element]:
+         """
+         Parse a single document and return the elements.
+
+         Args:
+             document: The document to parse.
+             parser_router: The document parser router to use.
+
+         Returns:
+             The list of elements.
+
+         Raises:
+             ParserError: If the parsing of the document failed.
+             ParserDocumentNotSupportedError: If the document type is not supported.
+             ParserNotFoundError: If no parser is found for the document type.
+             SourceError: If the download of the document failed.
+         """
+         document_meta = (
+             await DocumentMeta.from_source(document)
+             if isinstance(document, Source)
+             else document
+             if isinstance(document, DocumentMeta)
+             else document.metadata
+         )
+
+         parser = parser_router.get(document_meta.document_type)
+         parser.validate_document_type(document_meta.document_type)
+         document = await document_meta.fetch()
+
+         return await parser.parse(document)
+
+     @staticmethod
+     async def _enrich_elements(
+         elements: Iterable[Element],
+         enricher_router: ElementEnricherRouter,
+     ) -> list[Element]:
+         """
+         Enrich elements for a single document.
+
+         Args:
+             elements: The document elements to enrich.
+             enricher_router: The element enricher router to use.
+
+         Returns:
+             The list of enriched elements.
+
+         Raises:
+             EnricherError: If the enrichment of the elements failed.
+             EnricherElementNotSupportedError: If the element type is not supported.
+         """
+         grouped_elements = defaultdict(list)
+         for element in elements:
+             grouped_elements[type(element)].append(element)
+
+         # Separate elements that have enrichers from those that don't
+         elements_to_enrich = []
+         elements_without_enrichers = []
+
+         for element_type, elements_of_type in grouped_elements.items():
+             if element_type in enricher_router:
+                 enricher = enricher_router.get(element_type)
+                 enricher.validate_element_type(element_type)
+                 elements_to_enrich.append((element_type, elements_of_type))
+             else:
+                 # No enricher found for this element type, keep elements as-is
+                 elements_without_enrichers.extend(elements_of_type)
+
+         # Enrich elements that have enrichers
+         if elements_to_enrich:
+             grouped_enriched_elements = await asyncio.gather(
+                 *[
+                     enricher_router.get(element_type).enrich(elements_of_type)
+                     for element_type, elements_of_type in elements_to_enrich
+                 ]
+             )
+             enriched_elements = [element for enriched_group in grouped_enriched_elements for element in enriched_group]
+         else:
+             enriched_elements = []
+
+         # Combine enriched elements with elements that don't need enrichment
+         return enriched_elements + elements_without_enrichers
+
+     @staticmethod
+     async def _remove_elements(document_ids: list[str], vector_store: VectorStore) -> None:
+         """
+         Remove document entries from the vector store.
+
+         Args:
+             document_ids: The list of document ids to remove from the vector store.
+             vector_store: The vector store to remove document elements from.
+         """
+         # TODO: Pass 'where' argument to the list method to filter results and optimize search
+         ids_to_delete = [
+             entry.id
+             for entry in await vector_store.list()
+             if entry.metadata.get("document_meta", {}).get("source", {}).get("id") in document_ids
+         ]
+         if ids_to_delete:
+             await vector_store.remove(ids_to_delete)
+
+     @staticmethod
+     async def _insert_elements(elements: Iterable[Element], vector_store: VectorStore) -> None:
+         """
+         Insert elements into the vector store.
+
+         Args:
+             elements: The list of elements to insert.
+             vector_store: The vector store to store document chunks.
+         """
+         entries = [element.to_vector_db_entry() for element in elements]
+
+         # Deduplicate entries by their unique ID to prevent duplicate key errors in the
+         # underlying vector store implementation (many vector stores require IDs to be
+         # unique and will raise an error if duplicates are provided).
+         unique_entries: dict = {}
+         for entry in entries:
+             # If the ID is already present, skip the duplicate and log a warning.
+             # This keeps the ingest operation idempotent while still indexing
+             # the first occurrence of every element.
+             if entry.id not in unique_entries:
+                 unique_entries[entry.id] = entry
+             else:
+                 logger.warning(
+                     f"Skipping duplicate entry: {entry.id} from document "
+                     f"{entry.metadata.get('document_meta', {}).get('source', {}).get('id')}"
+                 )
+
+         if unique_entries:
+             await vector_store.store(list(unique_entries.values()))
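
The retry loop in _call_with_error_handling above implements capped exponential backoff with full jitter. A standalone sketch (not from the package) of the resulting delay schedule, using only the defaults defined in this diff:

    import random


    def retry_delays(num_retries: int = 3, backoff_multiplier: int = 1, backoff_max: int = 60) -> list[float]:
        # One sleep precedes each retry; the final failing attempt re-raises without sleeping.
        delays = []
        for attempt in range(num_retries):
            cap = min(2**attempt * backoff_multiplier, backoff_max)
            delays.append(random.uniform(0, cap))  # "full jitter": uniform draw from [0, cap]
        return delays


    # With the defaults the caps are 1s, 2s, and 4s, so retries add at most
    # 7 seconds of delay before the last exception propagates.
    print(retry_delays())
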