ragbits-document-search 1.4.0.dev202601310254__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/document_search/__init__.py +3 -0
- ragbits/document_search/_main.py +273 -0
- ragbits/document_search/cli.py +109 -0
- ragbits/document_search/documents/__init__.py +0 -0
- ragbits/document_search/documents/document.py +203 -0
- ragbits/document_search/documents/element.py +208 -0
- ragbits/document_search/ingestion/__init__.py +0 -0
- ragbits/document_search/ingestion/enrichers/__init__.py +5 -0
- ragbits/document_search/ingestion/enrichers/base.py +64 -0
- ragbits/document_search/ingestion/enrichers/exceptions.py +32 -0
- ragbits/document_search/ingestion/enrichers/image.py +107 -0
- ragbits/document_search/ingestion/enrichers/router.py +86 -0
- ragbits/document_search/ingestion/parsers/__init__.py +9 -0
- ragbits/document_search/ingestion/parsers/base.py +97 -0
- ragbits/document_search/ingestion/parsers/docling.py +178 -0
- ragbits/document_search/ingestion/parsers/exceptions.py +32 -0
- ragbits/document_search/ingestion/parsers/pptx/__init__.py +28 -0
- ragbits/document_search/ingestion/parsers/pptx/callbacks.py +32 -0
- ragbits/document_search/ingestion/parsers/pptx/exceptions.py +52 -0
- ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py +84 -0
- ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py +78 -0
- ragbits/document_search/ingestion/parsers/pptx/parser.py +85 -0
- ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py +75 -0
- ragbits/document_search/ingestion/parsers/router.py +90 -0
- ragbits/document_search/ingestion/parsers/unstructured.py +248 -0
- ragbits/document_search/ingestion/strategies/__init__.py +6 -0
- ragbits/document_search/ingestion/strategies/base.py +290 -0
- ragbits/document_search/ingestion/strategies/batched.py +261 -0
- ragbits/document_search/ingestion/strategies/ray.py +138 -0
- ragbits/document_search/ingestion/strategies/sequential.py +23 -0
- ragbits/document_search/py.typed +0 -0
- ragbits/document_search/retrieval/__init__.py +0 -0
- ragbits/document_search/retrieval/rephrasers/__init__.py +18 -0
- ragbits/document_search/retrieval/rephrasers/base.py +39 -0
- ragbits/document_search/retrieval/rephrasers/llm.py +141 -0
- ragbits/document_search/retrieval/rephrasers/noop.py +26 -0
- ragbits/document_search/retrieval/rerankers/__init__.py +4 -0
- ragbits/document_search/retrieval/rerankers/answerai.py +82 -0
- ragbits/document_search/retrieval/rerankers/base.py +56 -0
- ragbits/document_search/retrieval/rerankers/litellm.py +85 -0
- ragbits/document_search/retrieval/rerankers/llm.py +177 -0
- ragbits/document_search/retrieval/rerankers/noop.py +34 -0
- ragbits/document_search/retrieval/rerankers/rrf.py +73 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA +85 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD +46 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
from collections.abc import Iterable, Sequence
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from types import ModuleType
|
|
4
|
+
from typing import ClassVar, Generic
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
from typing_extensions import Self
|
|
8
|
+
|
|
9
|
+
from ragbits import document_search
|
|
10
|
+
from ragbits.core.audit.traces import trace, traceable
|
|
11
|
+
from ragbits.core.config import CoreConfig
|
|
12
|
+
from ragbits.core.options import Options
|
|
13
|
+
from ragbits.core.sources.base import Source, SourceResolver
|
|
14
|
+
from ragbits.core.types import NOT_GIVEN, NotGiven
|
|
15
|
+
from ragbits.core.utils._pyproject import get_config_from_yaml
|
|
16
|
+
from ragbits.core.utils.config_handling import ConfigurableComponent, NoPreferredConfigError, ObjectConstructionConfig
|
|
17
|
+
from ragbits.core.vector_stores.base import VectorStore, VectorStoreOptionsT
|
|
18
|
+
from ragbits.document_search.documents.document import Document, DocumentMeta
|
|
19
|
+
from ragbits.document_search.documents.element import Element
|
|
20
|
+
from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
|
|
21
|
+
from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
|
|
22
|
+
from ragbits.document_search.ingestion.strategies.base import (
|
|
23
|
+
IngestExecutionError,
|
|
24
|
+
IngestExecutionResult,
|
|
25
|
+
IngestStrategy,
|
|
26
|
+
)
|
|
27
|
+
from ragbits.document_search.ingestion.strategies.sequential import SequentialIngestStrategy
|
|
28
|
+
from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser, QueryRephraserOptionsT
|
|
29
|
+
from ragbits.document_search.retrieval.rephrasers.noop import NoopQueryRephraser
|
|
30
|
+
from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptionsT
|
|
31
|
+
from ragbits.document_search.retrieval.rerankers.noop import NoopReranker
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DocumentSearchOptions(Options, Generic[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT]):
    """
    Bundle of per-stage options accepted by the document search.

    Every field defaults to ``NOT_GIVEN``.

    Attributes:
        query_rephraser_options: Options for the query rephraser.
        vector_store_options: Options for the vector store.
        reranker_options: Options for the reranker.
    """

    query_rephraser_options: QueryRephraserOptionsT | None | NotGiven = NOT_GIVEN
    vector_store_options: VectorStoreOptionsT | None | NotGiven = NOT_GIVEN
    reranker_options: RerankerOptionsT | None | NotGiven = NOT_GIVEN
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class DocumentSearchConfig(BaseModel):
    """
    Validation schema for a document search configuration mapping.

    Only ``vector_store`` is mandatory; every other entry has a default.
    """

    # Construction config of the vector store (required).
    vector_store: ObjectConstructionConfig
    # Construction config of the query rephraser; defaults to the no-op rephraser.
    rephraser: ObjectConstructionConfig = ObjectConstructionConfig(type="NoopQueryRephraser")
    # Construction config of the reranker; defaults to the no-op reranker.
    reranker: ObjectConstructionConfig = ObjectConstructionConfig(type="NoopReranker")
    # Construction config of the ingest strategy; defaults to the sequential strategy.
    ingest_strategy: ObjectConstructionConfig = ObjectConstructionConfig(type="SequentialIngestStrategy")
    # Construction configs passed to DocumentParserRouter.from_config; empty by default.
    parser_router: dict[str, ObjectConstructionConfig] = {}
    # Construction configs passed to ElementEnricherRouter.from_config; empty by default.
    enricher_router: dict[str, ObjectConstructionConfig] = {}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class DocumentSearch(
    ConfigurableComponent[DocumentSearchOptions[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT]]
):
    """
    Main entrypoint to the document search functionality. It provides methods for document retrieval and ingestion.

    Retrieval:
        1. Uses QueryRephraser to rephrase the query.
        2. Uses VectorStore to retrieve the most relevant elements.
        3. Uses Reranker to rerank the elements.

    Ingestion:
        1. Uses IngestStrategy to orchestrate ingestion process.
        2. Uses DocumentParserRouter to route the document to the appropriate DocumentParser to parse the content.
        3. Uses ElementEnricherRouter to redirect the element to the appropriate ElementEnricher to enrich the element.
    """

    options_cls: type[DocumentSearchOptions] = DocumentSearchOptions
    default_module: ClassVar[ModuleType | None] = document_search
    configuration_key: ClassVar[str] = "document_search"

    def __init__(
        self,
        vector_store: VectorStore[VectorStoreOptionsT],
        *,
        query_rephraser: QueryRephraser[QueryRephraserOptionsT] | None = None,
        reranker: Reranker[RerankerOptionsT] | None = None,
        default_options: DocumentSearchOptions[
            QueryRephraserOptionsT,
            VectorStoreOptionsT,
            RerankerOptionsT,
        ]
        | None = None,
        ingest_strategy: IngestStrategy | None = None,
        parser_router: DocumentParserRouter | None = None,
        enricher_router: ElementEnricherRouter | None = None,
    ) -> None:
        """
        Initialize the DocumentSearch instance.

        Every optional component that is omitted (or otherwise falsy) is replaced by a default:
        NoopQueryRephraser, NoopReranker, SequentialIngestStrategy, DocumentParserRouter and
        ElementEnricherRouter, each constructed without arguments.

        Args:
            vector_store: The vector store to use for retrieval.
            query_rephraser: The query rephraser to use for retrieval.
            reranker: The reranker to use for retrieval.
            default_options: The default options for the search.
            ingest_strategy: The ingestion strategy to use for ingestion.
            parser_router: The document parser router to use for ingestion.
            enricher_router: The element enricher router to use for ingestion.
        """
        super().__init__(default_options=default_options)
        self.vector_store = vector_store
        self.query_rephraser = query_rephraser or NoopQueryRephraser()
        self.reranker = reranker or NoopReranker()
        self.ingest_strategy = ingest_strategy or SequentialIngestStrategy()
        self.parser_router = parser_router or DocumentParserRouter()
        self.enricher_router = enricher_router or ElementEnricherRouter()

    @classmethod
    def from_config(cls, config: dict) -> Self:
        """
        Creates and returns an instance of the DocumentSearch class from the given configuration.

        The mapping is validated against DocumentSearchConfig and each component is then built from
        its own section. No default options are passed to the constructor.

        Args:
            config: A configuration object containing the configuration for initializing the DocumentSearch instance.

        Returns:
            DocumentSearch: An initialized instance of the DocumentSearch class.

        Raises:
            ValidationError: If the configuration doesn't follow the expected format.
            InvalidConfigError: If one of the specified classes can't be found or is not the correct type.
        """
        model = DocumentSearchConfig.model_validate(config)

        query_rephraser: QueryRephraser = QueryRephraser.subclass_from_config(model.rephraser)
        vector_store: VectorStore = VectorStore.subclass_from_config(model.vector_store)
        reranker: Reranker = Reranker.subclass_from_config(model.reranker)

        ingest_strategy = IngestStrategy.subclass_from_config(model.ingest_strategy)
        parser_router = DocumentParserRouter.from_config(model.parser_router)
        enricher_router = ElementEnricherRouter.from_config(model.enricher_router)

        return cls(
            vector_store=vector_store,
            query_rephraser=query_rephraser,
            reranker=reranker,
            ingest_strategy=ingest_strategy,
            parser_router=parser_router,
            enricher_router=enricher_router,
        )

    @classmethod
    def preferred_subclass(
        cls,
        config: CoreConfig,
        factory_path_override: str | None = None,
        yaml_path_override: Path | None = None,
    ) -> Self:
        """
        Tries to create an instance by looking at the project's component preferences, either from YAML
        or from the factory. Takes optional overrides for both, which take a higher precedence.

        Sources are tried in this order, and the first one that applies wins:
            1. ``yaml_path_override``
            2. ``factory_path_override``
            3. the preferred factory registered in ``config`` under the "document_search" key
            4. the preferred instances configuration referenced by ``config``

        For both YAML-based sources, an explicit entry under the "document_search" key is used when
        present; otherwise the whole configuration is passed to ``from_config``.

        Args:
            config: The CoreConfig instance containing preferred factory and configuration details.
            factory_path_override: A string representing the path to the factory function
                in the format of "module.submodule:factory_name".
            yaml_path_override: Path to the YAML file containing the configuration. Looks for the
                configuration under the key "document_search", and if not found, instantiates the
                class with the preferred configuration for each component.

        Returns:
            The created instance.

        Raises:
            NoPreferredConfigError: If no preferred factory or configuration can be found.
        """
        if yaml_path_override:
            preferences = get_config_from_yaml(yaml_path_override)

            # Look for explicit document search configuration
            if type_config := preferences.get(cls.configuration_key):
                return cls.subclass_from_config(ObjectConstructionConfig.model_validate(type_config))

            # Instantiate the class with the preferred configuration for each component
            return cls.from_config(preferences)

        if factory_path_override:
            return cls.subclass_from_factory(factory_path_override)

        if preferred_factory := config.component_preference_factories.get(cls.configuration_key):
            return cls.subclass_from_factory(preferred_factory)

        if config.component_preference_config_path is not None:
            # Look for explicit document search configuration
            if preferred_config := config.preferred_instances_config.get(cls.configuration_key):
                return cls.subclass_from_config(ObjectConstructionConfig.model_validate(preferred_config))

            # Instantiate the class with the preferred configuration for each component
            return cls.from_config(config.preferred_instances_config)

        raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

    async def search(
        self,
        query: str,
        options: DocumentSearchOptions[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT] | None = None,
    ) -> Sequence[Element]:
        """
        Search for the most relevant elements for a query.

        The query is rephrased into one or more queries, each of them is sent to the vector store,
        and the per-query result lists are handed to the reranker together with the original query.

        Args:
            query: The query to search for.
            options: The document search retrieval options. When given, they are merged
                with the default options.

        Returns:
            The elements returned by the reranker.
        """
        merged_options = (self.default_options | options) if options else self.default_options
        # `or None` turns any falsy option value into None before it is passed on
        # (presumably this covers NOT_GIVEN - confirm against ragbits.core.types).
        query_rephraser_options = merged_options.query_rephraser_options or None
        vector_store_options = merged_options.vector_store_options or None
        reranker_options = merged_options.reranker_options or None

        with trace(query=query, options=merged_options) as outputs:
            queries = await self.query_rephraser.rephrase(query, query_rephraser_options)
            # One inner list per rephrased query. The loop variable is deliberately not called
            # `query`, so it is obvious that the reranker below receives the original query.
            elements = [
                [
                    Element.from_vector_db_entry(result.entry, result.score)
                    for result in await self.vector_store.retrieve(rephrased_query, vector_store_options)
                ]
                for rephrased_query in queries
            ]
            outputs.results = await self.reranker.rerank(
                elements=elements,
                query=query,
                options=reranker_options,
            )

        return outputs.results

    @traceable
    async def ingest(
        self,
        documents: str | Iterable[DocumentMeta | Document | Source],
        fail_on_error: bool = True,
    ) -> IngestExecutionResult:
        """
        Ingest documents into the search index.

        Args:
            documents: A string representing a source-specific URI (e.g., "gcs://bucket/*") or an iterable of
                `Document`, `DocumentMeta`, or `Source` objects. Examples of URI formats include:
                - "file:///path/to/files/*.txt"
                - "gcs://bucket/folder/*"
                - "huggingface://dataset/split/row"
            fail_on_error: If True, raises IngestExecutionError when any errors are encountered during ingestion.
                If False, returns all errors encountered in the IngestExecutionResult.

        Returns:
            An IngestExecutionResult containing the results of the ingestion process.

        Raises:
            IngestExecutionError: If fail_on_error is True and any errors are encountered during ingestion.
        """
        # Only string inputs go through the SourceResolver; iterables are used as they are.
        resolved_documents = await SourceResolver.resolve(documents) if isinstance(documents, str) else documents
        results = await self.ingest_strategy(
            documents=resolved_documents,
            vector_store=self.vector_store,
            parser_router=self.parser_router,
            enricher_router=self.enricher_router,
        )

        if fail_on_error and results.failed:
            raise IngestExecutionError(results.failed)

        return results
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Annotated
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
from ragbits.cli._utils import get_instance_or_exit
|
|
10
|
+
from ragbits.cli.state import print_output
|
|
11
|
+
from ragbits.core.vector_stores.base import VectorStoreOptions
|
|
12
|
+
from ragbits.document_search._main import DocumentSearch, DocumentSearchOptions
|
|
13
|
+
|
|
14
|
+
# Typer sub-application for the document search commands; `register()` mounts it on the main app.
ds_app = typer.Typer(no_args_is_help=True)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def register(app: typer.Typer) -> None:
    """
    Attach the document search command group to a Typer application.

    Args:
        app: The Typer object that receives the commands under the "document-search" name.
    """
    app.add_typer(
        ds_app,
        name="document-search",
        help="Commands for interacting with the document search",
    )
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class _CLIState:
    """Holder for the objects shared between the CLI callback and the CLI commands."""

    # The document search instance the commands operate on; None until one is assigned.
    document_search: DocumentSearch | None = None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Module-level CLI state; its `document_search` field is filled in by the `common_args` callback.
state: _CLIState = _CLIState()
|
|
33
|
+
|
|
34
|
+
# Default columns for commands that list entries
|
|
35
|
+
# Comma-separated column names; used as the default of the `columns` option of the `search` command.
_default_columns = "element_type,key"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class IngestedItem(BaseModel):
    """Output record describing a source that was ingested."""

    # The source pattern that was passed to the `ingest` command.
    source: str
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@ds_app.callback()
def common_args(
    factory_path: Annotated[
        str | None,
        typer.Option(
            help="Python path to a function that creates a document search object, "
            "in a 'module.submodule:function' format"
        ),
    ] = None,
    yaml_path: Annotated[
        Path | None,
        typer.Option(help="Path to a YAML configuration file for the document search", exists=True, resolve_path=True),
    ] = None,
) -> None:
    """
    Common arguments for the document search commands.
    """
    # NOTE: the docstring above is kept verbatim because Typer can surface it as help text.
    # Build the DocumentSearch instance from the given factory / YAML path and keep it
    # in the module-level state, where the `search` and `ingest` commands read it.
    document_search = get_instance_or_exit(DocumentSearch, factory_path=factory_path, yaml_path=yaml_path)
    state.document_search = document_search
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@ds_app.command()
def search(
    query: Annotated[str, typer.Argument(help="Text to query with")],
    k: Annotated[int, typer.Option(help="Number of entries to retrieve")] = 5,
    columns: Annotated[
        str,
        typer.Option(
            help="Comma-separated list of columns to display, "
            "available: id, element_type, key, location, text_representation, document_meta"
        ),
    ] = _default_columns,
) -> None:
    """
    Query the chosen vector store.
    """
    # NOTE: the docstring above is kept verbatim because Typer shows it as the command help.

    async def _query_and_print() -> None:
        # The instance is expected to have been stored in `state` by the `common_args` callback.
        document_search = state.document_search
        if document_search is None:
            raise ValueError("Document search not initialized")

        # `k` is forwarded to the vector store through its options.
        vector_store_options = VectorStoreOptions(k=k)
        options: DocumentSearchOptions = DocumentSearchOptions(vector_store_options=vector_store_options)
        entries = await document_search.search(query, options)
        print_output(entries, columns=columns)

    asyncio.run(_query_and_print())
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@ds_app.command()
def ingest(
    source: Annotated[str, typer.Argument(help="Source pattern")],
) -> None:
    """
    Ingest the elements from a given source to vector store.
    """
    # NOTE: the docstring above is kept verbatim because Typer shows it as the command help.

    async def _ingest_and_report() -> None:
        # The instance is expected to have been stored in `state` by the `common_args` callback.
        document_search = state.document_search
        if document_search is None:
            raise ValueError("Document search not initialized")

        await document_search.ingest(source)
        # Reached only when ingest() did not raise; reports the source that was passed in.
        print_output(IngestedItem(source=source))

    asyncio.run(_ingest_and_report())
|
|
File without changes
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
import tempfile
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Annotated, Any
|
|
5
|
+
|
|
6
|
+
import filetype
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
from typing_extensions import deprecated
|
|
9
|
+
|
|
10
|
+
from ragbits.core.sources.base import Source, SourceDiscriminator
|
|
11
|
+
from ragbits.core.sources.local import LocalFileSource
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DocumentType(str, Enum):
    """
    Document types that can be parsed.

    Looking up a value that matches no member (e.g. ``DocumentType("foo")``) returns
    ``DocumentType.UNKNOWN`` instead of raising ``ValueError``; see ``_missing_``.
    """

    MD = "md"
    TXT = "txt"
    PDF = "pdf"
    CSV = "csv"
    DOC = "doc"
    DOCX = "docx"
    HTML = "html"
    EPUB = "epub"
    XLSX = "xlsx"
    XLS = "xls"
    ORG = "org"
    ODT = "odt"
    PPT = "ppt"
    PPTX = "pptx"
    RST = "rst"
    RTF = "rtf"
    TSV = "tsv"
    JSON = "json"
    JSONL = "jsonl"
    XML = "xml"
    JPG = "jpg"
    PNG = "png"

    UNKNOWN = "unknown"

    @classmethod
    def _missing_(cls, value: object) -> Any:  # noqa: ANN401
        """
        Return UNKNOWN if the value is not found in the enum.

        Args:
            value: The value that did not match any member.

        Returns:
            The UNKNOWN member.
        """
        return cls.UNKNOWN
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class DocumentMeta(BaseModel):
    """
    An object representing a document metadata.
    """

    # The type of the document.
    document_type: DocumentType
    # The source the document is fetched from.
    source: Annotated[Source, SourceDiscriminator()]

    @property
    def id(self) -> str:
        """
        Get the document ID.

        Returns:
            The document ID, which is the ID of the underlying source.
        """
        return self.source.id

    async def fetch(self) -> "Document":
        """
        This method fetches the document from source (potentially remote) and creates an object to interface with it.
        Based on the document type, it will return a different object.

        Returns:
            The document.
        """
        local_path = await self.source.fetch()
        return Document.from_document_meta(self, local_path)

    @classmethod
    @deprecated("Use from_literal() instead")
    def create_text_document_from_literal(cls, content: str) -> "DocumentMeta":
        """
        Create a text document from a literal content. This method is deprecated, use from_literal() instead.

        Args:
            content: The content of the document.

        Returns:
            The document metadata.
        """
        return cls.from_literal(content)

    @classmethod
    def from_literal(cls, content: str) -> "DocumentMeta":
        """
        Create a text document from a literal content.

        The content is written, UTF-8 encoded, to a named temporary file that is created with
        ``delete=False``, so the file is still on disk after this method returns. This method
        never removes it.

        Args:
            content: The content of the document.

        Returns:
            The document metadata, typed as TXT and pointing at the temporary file.
        """
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(content.encode())

        return cls(
            document_type=DocumentType.TXT,
            source=LocalFileSource(path=Path(temp_file.name)),
        )

    @classmethod
    def from_local_path(cls, local_path: Path) -> "DocumentMeta":
        """
        Create a document metadata from a local path.

        Args:
            local_path: The local path to the document.

        Returns:
            The document metadata.
        """
        return cls(
            document_type=cls._infer_document_type(local_path),
            source=LocalFileSource(path=local_path),
        )

    @classmethod
    async def from_source(cls, source: Source) -> "DocumentMeta":
        """
        Create a document metadata from a source.

        The source is fetched first, because the document type is inferred from the fetched file.

        Args:
            source: The source from which the document is fetched.

        Returns:
            The document metadata.
        """
        path = await source.fetch()

        return cls(
            document_type=cls._infer_document_type(path),
            source=source,
        )

    @staticmethod
    def _infer_document_type(path: Path) -> DocumentType:
        """
        Infer the document type by checking the file signature. Use the file extension as a fallback.

        The extension is compared case-insensitively, so "NOTES.TXT" is recognized as TXT.
        A file without an extension, or with an unrecognized one, yields UNKNOWN.

        Args:
            path: The path to the file.

        Returns:
            The inferred document type.
        """
        if kind := filetype.guess(path):
            return DocumentType(kind.extension)
        # Drop the leading dot and normalize the case: the enum values are all lower-case,
        # and an upper-case extension would otherwise fall through to UNKNOWN.
        return DocumentType(path.suffix[1:].lower())
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class Document(BaseModel):
    """
    A document that has been fetched and is available on the local file system.
    """

    # Location of the fetched file.
    local_path: Path
    # Metadata describing the document.
    metadata: DocumentMeta

    @classmethod
    def from_document_meta(cls, document_meta: DocumentMeta, local_path: Path) -> "Document":
        """
        Build the document object matching the type recorded in the metadata.

        Markdown and plain-text documents become a TextDocument; every other type is
        instantiated as the class this method is called on.

        Args:
            document_meta: The document metadata.
            local_path: The local path to the document.

        Returns:
            The document.
        """
        is_plain_text = document_meta.document_type in (DocumentType.MD, DocumentType.TXT)
        document_cls = TextDocument if is_plain_text else cls
        return document_cls(local_path=local_path, metadata=document_meta)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class TextDocument(Document):
    """
    An object representing a text document.
    """

    @property
    def content(self) -> str:
        """
        Get the content of the document.

        The file is decoded as UTF-8 regardless of the platform's locale encoding.

        Returns:
            The content of the document.

        Raises:
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # An explicit encoding keeps the result identical across platforms; without it,
        # read_text() falls back to the locale encoding, which is not UTF-8 everywhere.
        return self.local_path.read_text(encoding="utf-8")
|