ragbits-document-search 1.4.0.dev202601310254__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/document_search/__init__.py +3 -0
- ragbits/document_search/_main.py +273 -0
- ragbits/document_search/cli.py +109 -0
- ragbits/document_search/documents/__init__.py +0 -0
- ragbits/document_search/documents/document.py +203 -0
- ragbits/document_search/documents/element.py +208 -0
- ragbits/document_search/ingestion/__init__.py +0 -0
- ragbits/document_search/ingestion/enrichers/__init__.py +5 -0
- ragbits/document_search/ingestion/enrichers/base.py +64 -0
- ragbits/document_search/ingestion/enrichers/exceptions.py +32 -0
- ragbits/document_search/ingestion/enrichers/image.py +107 -0
- ragbits/document_search/ingestion/enrichers/router.py +86 -0
- ragbits/document_search/ingestion/parsers/__init__.py +9 -0
- ragbits/document_search/ingestion/parsers/base.py +97 -0
- ragbits/document_search/ingestion/parsers/docling.py +178 -0
- ragbits/document_search/ingestion/parsers/exceptions.py +32 -0
- ragbits/document_search/ingestion/parsers/pptx/__init__.py +28 -0
- ragbits/document_search/ingestion/parsers/pptx/callbacks.py +32 -0
- ragbits/document_search/ingestion/parsers/pptx/exceptions.py +52 -0
- ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py +84 -0
- ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py +78 -0
- ragbits/document_search/ingestion/parsers/pptx/parser.py +85 -0
- ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py +75 -0
- ragbits/document_search/ingestion/parsers/router.py +90 -0
- ragbits/document_search/ingestion/parsers/unstructured.py +248 -0
- ragbits/document_search/ingestion/strategies/__init__.py +6 -0
- ragbits/document_search/ingestion/strategies/base.py +290 -0
- ragbits/document_search/ingestion/strategies/batched.py +261 -0
- ragbits/document_search/ingestion/strategies/ray.py +138 -0
- ragbits/document_search/ingestion/strategies/sequential.py +23 -0
- ragbits/document_search/py.typed +0 -0
- ragbits/document_search/retrieval/__init__.py +0 -0
- ragbits/document_search/retrieval/rephrasers/__init__.py +18 -0
- ragbits/document_search/retrieval/rephrasers/base.py +39 -0
- ragbits/document_search/retrieval/rephrasers/llm.py +141 -0
- ragbits/document_search/retrieval/rephrasers/noop.py +26 -0
- ragbits/document_search/retrieval/rerankers/__init__.py +4 -0
- ragbits/document_search/retrieval/rerankers/answerai.py +82 -0
- ragbits/document_search/retrieval/rerankers/base.py +56 -0
- ragbits/document_search/retrieval/rerankers/litellm.py +85 -0
- ragbits/document_search/retrieval/rerankers/llm.py +177 -0
- ragbits/document_search/retrieval/rerankers/noop.py +34 -0
- ragbits/document_search/retrieval/rerankers/rrf.py +73 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA +85 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD +46 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import uuid
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any, ClassVar
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, computed_field
|
|
7
|
+
|
|
8
|
+
from ragbits.core.utils.pydantic import SerializableBytes
|
|
9
|
+
from ragbits.core.vector_stores.base import VectorStoreEntry
|
|
10
|
+
from ragbits.document_search.documents.document import DocumentMeta
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ElementLocation(BaseModel):
    """
    Describes where a chunk sits inside its source document.
    """

    # Page the chunk was found on; None when no page is recorded.
    page_number: int | None = None
    # Free-form coordinate data for the chunk; None when not recorded.
    coordinates: dict | None = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Element(BaseModel, ABC):
    """
    An object representing an element in a document.

    Concrete subclasses are recorded in a shared registry keyed by the default value of their
    `element_type` field, which lets `from_vector_db_entry` rebuild the right subclass from
    stored metadata.
    """

    element_type: str
    document_meta: DocumentMeta
    location: ElementLocation | None = None
    score: float | None = None

    # Maps the default `element_type` of every subclass to the subclass itself.
    _elements_registry: ClassVar[dict[str, type["Element"]]] = {}

    @property
    def id(self) -> str:
        """
        Retrieve the ID of the element, primarily used to represent the element's data.

        Returns:
            The `key=value` pairs from `get_id_components` joined with `&`.
        """
        id_components = self.get_id_components()
        return "&".join(f"{k}={v}" for k, v in id_components.items())

    def get_id_components(self) -> dict[str, str]:
        """
        Creates a dictionary of key value pairs of id components.

        Returns:
            A dictionary with the document meta id, element type, key, text representation
            and location, each (apart from the first two) converted with `str`.
        """
        id_components = {
            "meta": self.document_meta.id,
            "type": self.element_type,
            "key": str(self.key),
            "text": str(self.text_representation),
            "location": str(self.location),
        }
        return id_components

    @computed_field  # type: ignore[prop-decorator]
    @property
    def key(self) -> str | None:
        """
        Get the representation of the element for embedding.

        Returns:
            The representation for embedding; by default the text representation.
        """
        return self.text_representation

    @computed_field  # type: ignore[prop-decorator]
    @property
    @abstractmethod
    def text_representation(self) -> str | None:
        """
        Get the text representation of the element.

        Returns:
            The text representation.
        """

    @property
    def image_representation(self) -> bytes | None:
        """
        Get the image representation of the element.

        Returns:
            The image representation; None unless a subclass overrides this property.
        """
        return None

    @classmethod
    def __pydantic_init_subclass__(cls, **kwargs: Any) -> None:  # noqa: ANN401
        """
        Register the newly created subclass under the default value of its `element_type` field.

        Raises:
            ValueError: If the default of `element_type` is None.
        """
        element_type_default = cls.model_fields["element_type"].default
        # NOTE(review): pydantic appears to report a field without any default as
        # `PydanticUndefined` rather than None, so this guard may only catch an explicit
        # `element_type = None` - confirm before relying on it.
        if element_type_default is None:
            raise ValueError("Element type must be defined")
        Element._elements_registry[element_type_default] = cls

    @classmethod
    def from_vector_db_entry(cls, db_entry: VectorStoreEntry, score: float | None = None) -> "Element":
        """
        Create an element from a vector database entry.

        The metadata of `db_entry` is read but never modified.

        Args:
            db_entry: The vector database entry.
            score: The score of the element retrieved from the vector database or reranker.

        Returns:
            The element, instantiated as the subclass registered for the stored `element_type`.

        Raises:
            KeyError: If the metadata has no `element_type` or the type is not registered.
        """
        element_type = db_entry.metadata["element_type"]
        element_cls = Element._elements_registry[element_type]

        # Build a filtered copy instead of deleting from the entry's own metadata, so the
        # caller's VectorStoreEntry is left untouched and can be reused.
        metadata = {name: value for name, value in db_entry.metadata.items() if name != "embedding_type"}

        element = element_cls(**metadata)
        element.score = score
        return element

    def to_vector_db_entry(self) -> VectorStoreEntry:
        """
        Create a vector database entry from the element.

        Returns:
            The vector database entry, whose id is a UUIDv5 derived from the element id.
        """
        id_components = [
            self.id,
        ]
        vector_store_entry_id = uuid.uuid5(uuid.NAMESPACE_OID, ";".join(id_components))
        metadata = self.model_dump(exclude={"id", "key"})
        # The source id is written explicitly into the dumped metadata
        # (presumably because it is not a regular field of the source model - verify).
        metadata["document_meta"]["source"]["id"] = self.document_meta.source.id

        return VectorStoreEntry(
            id=vector_store_entry_id, text=self.key, image_bytes=self.image_representation, metadata=metadata
        )
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class TextElement(Element):
    """
    Element holding a piece of plain text taken from a document.
    """

    element_type: str = "text"
    # The raw text carried by this element.
    content: str

    @computed_field  # type: ignore[prop-decorator]
    @property
    def text_representation(self) -> str:
        """
        Expose the stored text as the element's text representation.

        Returns:
            The value of `content`, unchanged.
        """
        return self.content
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class ImageElement(Element):
    """
    Element holding an image taken from a document, with optional textual annotations.
    """

    element_type: str = "image"
    # Raw bytes of the image.
    image_bytes: SerializableBytes
    # Optional description of the image content.
    description: str | None = None
    # Optional text extracted from the image by OCR.
    ocr_extracted_text: str | None = None

    @computed_field  # type: ignore[prop-decorator]
    @property
    def text_representation(self) -> str | None:
        """
        Build the text representation from the description and the OCR text.

        Returns:
            The available parts concatenated (description first), or None when both are empty.
        """
        segments: list[str] = []
        if self.description:
            segments.append(f"Description: {self.description}\n")
        if self.ocr_extracted_text:
            segments.append(f"Extracted text: {self.ocr_extracted_text}")

        # No segment means neither annotation was present.
        if not segments:
            return None
        return "".join(segments)

    @property
    def image_representation(self) -> bytes:
        """
        Expose the stored image bytes as the element's image representation.

        Returns:
            The value of `image_bytes`.
        """
        return self.image_bytes

    def get_id_components(self) -> dict[str, str]:
        """
        Extend the base id components with a hash of the image.

        Returns:
            The base components plus `image_hash`, the SHA-256 hex digest of the image bytes.
        """
        image_digest = hashlib.sha256(self.image_bytes).hexdigest()
        return {**super().get_id_components(), "image_hash": image_digest}
|
|
File without changes
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
from ragbits.document_search.ingestion.enrichers.base import ElementEnricher
|
|
2
|
+
from ragbits.document_search.ingestion.enrichers.image import ImageElementEnricher
|
|
3
|
+
from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
|
|
4
|
+
|
|
5
|
+
__all__ = ["ElementEnricher", "ElementEnricherRouter", "ImageElementEnricher"]
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from types import ModuleType, UnionType
|
|
3
|
+
from typing import ClassVar, Generic, TypeVar, get_args, get_origin
|
|
4
|
+
|
|
5
|
+
from ragbits.core.utils.config_handling import WithConstructionConfig
|
|
6
|
+
from ragbits.document_search.documents.element import Element
|
|
7
|
+
from ragbits.document_search.ingestion import enrichers
|
|
8
|
+
from ragbits.document_search.ingestion.enrichers.exceptions import EnricherElementNotSupportedError
|
|
9
|
+
|
|
10
|
+
ElementT = TypeVar("ElementT", bound=Element)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ElementEnricher(Generic[ElementT], WithConstructionConfig, ABC):
    """
    Base class for element enrichers, responsible for providing additional information about elements.

    Enrichers operate on raw elements and are used to fill in missing fields that could not be filled in during parsing.
    They usually deal with summarizing text or describing images.
    """

    default_module: ClassVar[ModuleType | None] = enrichers
    configuration_key: ClassVar[str] = "enricher"

    @abstractmethod
    async def enrich(self, elements: list[ElementT]) -> list[ElementT]:
        """
        Enrich elements.

        Args:
            elements: The elements to be enriched.

        Returns:
            The list of enriched elements.

        Raises:
            EnricherError: If the enrichment of the elements failed.
        """

    @classmethod
    def validate_element_type(cls, element_type: type[Element]) -> None:
        """
        Check if the enricher supports the element type.

        The supported type is read from the type argument the enricher passed to its first
        generic base, e.g. `ElementEnricher[ImageElement]`. A union argument, written either as
        `A | B` or as `typing.Union[A, B]`, supports each of its members.

        Args:
            element_type: The element type to validate against the enricher.

        Raises:
            EnricherElementNotSupportedError: If the element type is not supported.
        """
        # Imported locally so this block does not depend on a change to the module imports.
        from typing import Union

        expected_element_type = cls.__orig_bases__[0].__args__[0]  # type: ignore

        # `A | B` reports types.UnionType as its origin while typing.Union[A, B] reports
        # typing.Union; both spellings have to be treated as a union of supported types.
        is_union = get_origin(expected_element_type) in (UnionType, Union)
        if is_union and element_type in get_args(expected_element_type):
            return

        # Otherwise the element type has to match the declared type exactly.
        if element_type == expected_element_type:
            return

        raise EnricherElementNotSupportedError(enricher_name=cls.__name__, element_type=element_type)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from ragbits.document_search.documents.element import Element
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class EnricherError(Exception):
    """
    Common base for every exception raised by element enrichers and the enricher router.
    """

    def __init__(self, message: str) -> None:
        # Keep the text available as an attribute in addition to Exception.args.
        self.message = message
        super().__init__(message)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class EnricherNotFoundError(EnricherError):
    """
    Signals that no enricher is available for a given element type.
    """

    def __init__(self, element_type: type[Element]) -> None:
        description = f"No enricher found for the element type {element_type}"
        super().__init__(description)
        # Expose the offending type so handlers can inspect it.
        self.element_type = element_type
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class EnricherElementNotSupportedError(EnricherError):
    """
    Signals that an enricher was asked to handle an element type it does not support.
    """

    def __init__(self, enricher_name: str, element_type: type[Element]) -> None:
        description = f"Element type {element_type} is not supported by the {enricher_name}"
        super().__init__(description)
        # Keep both parts of the failure available to handlers.
        self.element_type = element_type
        self.enricher_name = enricher_name
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
|
|
3
|
+
from ragbits.core.llms.base import LLM, LLMType
|
|
4
|
+
from ragbits.core.llms.factory import get_preferred_llm
|
|
5
|
+
from ragbits.core.prompt import Attachment, Prompt
|
|
6
|
+
from ragbits.core.utils.config_handling import ObjectConstructionConfig, import_by_path
|
|
7
|
+
from ragbits.document_search.documents.element import ImageElement
|
|
8
|
+
from ragbits.document_search.ingestion.enrichers.base import ElementEnricher
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ImageDescriberInput(BaseModel):
    """
    Input data for an image describer prompt.
    """

    # The image to be described, wrapped as a prompt attachment.
    image: Attachment
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ImageDescriberOutput(BaseModel):
    """
    Output data for an image describer prompt.
    """

    # Textual description of the image produced for the prompt.
    description: str
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ImageDescriberPrompt(Prompt[ImageDescriberInput, ImageDescriberOutput]):
    """
    Prompt for describing image elements using LLM.
    """

    # Fixed instruction text; it contains no placeholders.
    # NOTE(review): presumably the Prompt base class attaches the input's image to the
    # request on its own - confirm against the Prompt implementation.
    user_prompt = "Describe the content of the image."
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ImageElementEnricher(ElementEnricher[ImageElement]):
    """
    Enricher that describes image elements using LLM.
    """

    def __init__(
        self,
        llm: LLM | None = None,
        prompt: type[Prompt[ImageDescriberInput, ImageDescriberOutput]] | None = None,
    ) -> None:
        """
        Initialize the ImageElementEnricher instance.

        Args:
            llm: The language model to use for describing images. Defaults to the preferred vision LLM.
            prompt: The prompt class to use. Defaults to `ImageDescriberPrompt`.
        """
        self._llm = llm or get_preferred_llm(llm_type=LLMType.VISION)
        self._prompt = prompt or ImageDescriberPrompt

    async def enrich(self, elements: list[ImageElement]) -> list[ImageElement]:
        """
        Enrich image elements with additional description of the image.

        A new element is created for every input element; the inputs are not modified. The new
        element keeps the document meta, location, image bytes and OCR text of the original and
        receives the description generated by the LLM.

        Args:
            elements: The elements to be enriched.

        Returns:
            The list of enriched elements.

        Raises:
            EnricherElementNotSupportedError: If the element type is not supported.
            LLMError: If LLM generation fails.
        """
        responses: list[ImageDescriberOutput] = []
        for element in elements:
            self.validate_element_type(type(element))
            image = Attachment(data=element.image_bytes)
            prompt = self._prompt(ImageDescriberInput(image=image))
            responses.append(await self._llm.generate(prompt))

        return [
            ImageElement(
                document_meta=element.document_meta,
                # Carry the location over; otherwise the position found during parsing is lost.
                location=element.location,
                description=response.description,
                image_bytes=element.image_bytes,
                ocr_extracted_text=element.ocr_extracted_text,
            )
            for element, response in zip(elements, responses, strict=True)
        ]

    @classmethod
    def from_config(cls, config: dict) -> "ImageElementEnricher":
        """
        Create an `ImageElementEnricher` instance from a configuration dictionary.

        The passed dictionary is left unchanged, so the same configuration can be reused.

        Args:
            config: The dictionary containing the configuration settings.

        Returns:
            The initialized instance of `ImageElementEnricher`.

        Raises:
            ValidationError: If the configuration doesn't follow the expected format.
            InvalidConfigError: If llm or prompt can't be found or are not the correct type.
        """
        # Work on a shallow copy: writing the constructed objects back into the caller's
        # dictionary would make a second call with the same configuration fail validation.
        resolved = dict(config)
        resolved["llm"] = (
            LLM.subclass_from_config(ObjectConstructionConfig.model_validate(config["llm"]))
            if "llm" in config
            else None
        )
        resolved["prompt"] = import_by_path(config["prompt"]) if "prompt" in config else None
        return super().from_config(resolved)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from collections.abc import Mapping
|
|
2
|
+
from typing import ClassVar
|
|
3
|
+
|
|
4
|
+
from typing_extensions import Self
|
|
5
|
+
|
|
6
|
+
from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig, import_by_path
|
|
7
|
+
from ragbits.document_search.documents import element
|
|
8
|
+
from ragbits.document_search.documents.element import Element
|
|
9
|
+
from ragbits.document_search.ingestion.enrichers.base import ElementEnricher
|
|
10
|
+
from ragbits.document_search.ingestion.enrichers.exceptions import EnricherNotFoundError
|
|
11
|
+
|
|
12
|
+
_DEFAULT_ENRICHERS: dict[type[Element], ElementEnricher] = {}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ElementEnricherRouter(WithConstructionConfig):
    """
    Picks the enricher responsible for an element based on the element's type.
    """

    configuration_key: ClassVar[str] = "enricher_router"

    _enrichers: Mapping[type[Element], ElementEnricher]

    def __init__(
        self,
        enrichers: Mapping[type[Element], ElementEnricher] | None = None,
    ) -> None:
        """
        Set up the router.

        Args:
            enrichers: Mapping from element type to enricher. Entries given here take precedence
                over the default enrichers for the same element type.
        """
        if enrichers:
            # Custom entries are layered on top of the defaults.
            self._enrichers = {**_DEFAULT_ENRICHERS, **enrichers}
        else:
            self._enrichers = _DEFAULT_ENRICHERS

    def __contains__(self, element_type: type[Element]) -> bool:
        """
        Tell whether an enricher is registered for the given element type.

        Args:
            element_type: The element type.

        Returns:
            True if the element type has an entry in the router, otherwise False.
        """
        return element_type in self._enrichers

    @classmethod
    def from_config(cls, config: dict[str, ObjectConstructionConfig]) -> Self:
        """
        Build the router from a configuration dictionary.

        Args:
            config: Mapping from element type path to the construction config of its enricher.

        Returns:
            The ElementEnricherRouter.

        Raises:
            InvalidConfigError: If any of the provided enrichers cannot be initialized.
        """
        resolved: dict[type[Element], ElementEnricher] = {}
        for type_path, enricher_config in config.items():
            # Element type paths are resolved with the element module as the default module.
            element_cls = import_by_path(type_path, element)
            resolved[element_cls] = ElementEnricher.subclass_from_config(enricher_config)
        return super().from_config({"enrichers": resolved})

    def get(self, element_type: type[Element]) -> ElementEnricher:
        """
        Look up the enricher for an element type.

        Args:
            element_type: The element type.

        Returns:
            The enricher for processing the element.

        Raises:
            EnricherNotFoundError: If no enricher is found for the element type.
        """
        candidate = self._enrichers.get(element_type)

        # A missing entry and an entry that is not an enricher are both treated as "not found".
        if not isinstance(candidate, ElementEnricher):
            raise EnricherNotFoundError(element_type)

        return candidate
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from ragbits.document_search.ingestion.parsers.base import DocumentParser, ImageDocumentParser, TextDocumentParser
|
|
2
|
+
from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"DocumentParser",
|
|
6
|
+
"DocumentParserRouter",
|
|
7
|
+
"ImageDocumentParser",
|
|
8
|
+
"TextDocumentParser",
|
|
9
|
+
]
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from types import ModuleType
|
|
3
|
+
from typing import ClassVar
|
|
4
|
+
|
|
5
|
+
from ragbits.core.utils.config_handling import WithConstructionConfig
|
|
6
|
+
from ragbits.document_search.documents.document import Document, DocumentType
|
|
7
|
+
from ragbits.document_search.documents.element import Element, ImageElement, TextElement
|
|
8
|
+
from ragbits.document_search.ingestion import parsers
|
|
9
|
+
from ragbits.document_search.ingestion.parsers.exceptions import ParserDocumentNotSupportedError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DocumentParser(WithConstructionConfig, ABC):
    """
    Common base of all document parsers; a parser turns a document into a list of elements.
    """

    default_module: ClassVar[ModuleType | None] = parsers
    configuration_key: ClassVar[str] = "parser"

    # Document types the parser accepts; empty here, concrete parsers declare their own set.
    supported_document_types: set[DocumentType] = set()

    @abstractmethod
    async def parse(self, document: Document) -> list[Element]:
        """
        Turn the document into elements.

        Args:
            document: The document to parse.

        Returns:
            The list of elements extracted from the document.

        Raises:
            ParserError: If the parsing of the document failed.
        """

    @classmethod
    def validate_document_type(cls, document_type: DocumentType) -> None:
        """
        Ensure the parser declares support for the given document type.

        Args:
            document_type: The document type to validate against the parser.

        Raises:
            ParserDocumentNotSupportedError: If the document type is not supported.
        """
        if document_type in cls.supported_document_types:
            return
        raise ParserDocumentNotSupportedError(parser_name=cls.__name__, document_type=document_type)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class TextDocumentParser(DocumentParser):
    """
    Simple parser that maps a text to the text element.
    """

    supported_document_types = {DocumentType.TXT, DocumentType.MD}

    async def parse(self, document: Document) -> list[Element]:
        """
        Parse the document.

        The file is decoded as UTF-8.

        Args:
            document: The document to parse.

        Returns:
            List with a single text element holding the whole text content.

        Raises:
            ParserDocumentNotSupportedError: If the document type is not supported by the parser.
        """
        self.validate_document_type(document.metadata.document_type)
        # Decode explicitly as UTF-8: without an encoding argument the platform locale is used,
        # so the same file could be read differently (or fail) depending on the system.
        content = document.local_path.read_text(encoding="utf-8")
        return [TextElement(content=content, document_meta=document.metadata)]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class ImageDocumentParser(DocumentParser):
    """
    Minimal parser that wraps an image file in a single image element.
    """

    supported_document_types = {DocumentType.JPG, DocumentType.PNG}

    async def parse(self, document: Document) -> list[Element]:
        """
        Turn the image document into one image element.

        Args:
            document: The document to parse.

        Returns:
            List with a single image element holding the raw bytes of the file.

        Raises:
            ParserDocumentNotSupportedError: If the document type is not supported by the parser.
        """
        self.validate_document_type(document.metadata.document_type)
        raw_image = document.local_path.read_bytes()
        image_element = ImageElement(image_bytes=raw_image, document_meta=document.metadata)
        return [image_element]
|