ragbits-document-search 1.4.0.dev202601310254__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. ragbits/document_search/__init__.py +3 -0
  2. ragbits/document_search/_main.py +273 -0
  3. ragbits/document_search/cli.py +109 -0
  4. ragbits/document_search/documents/__init__.py +0 -0
  5. ragbits/document_search/documents/document.py +203 -0
  6. ragbits/document_search/documents/element.py +208 -0
  7. ragbits/document_search/ingestion/__init__.py +0 -0
  8. ragbits/document_search/ingestion/enrichers/__init__.py +5 -0
  9. ragbits/document_search/ingestion/enrichers/base.py +64 -0
  10. ragbits/document_search/ingestion/enrichers/exceptions.py +32 -0
  11. ragbits/document_search/ingestion/enrichers/image.py +107 -0
  12. ragbits/document_search/ingestion/enrichers/router.py +86 -0
  13. ragbits/document_search/ingestion/parsers/__init__.py +9 -0
  14. ragbits/document_search/ingestion/parsers/base.py +97 -0
  15. ragbits/document_search/ingestion/parsers/docling.py +178 -0
  16. ragbits/document_search/ingestion/parsers/exceptions.py +32 -0
  17. ragbits/document_search/ingestion/parsers/pptx/__init__.py +28 -0
  18. ragbits/document_search/ingestion/parsers/pptx/callbacks.py +32 -0
  19. ragbits/document_search/ingestion/parsers/pptx/exceptions.py +52 -0
  20. ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py +84 -0
  21. ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py +78 -0
  22. ragbits/document_search/ingestion/parsers/pptx/parser.py +85 -0
  23. ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py +75 -0
  24. ragbits/document_search/ingestion/parsers/router.py +90 -0
  25. ragbits/document_search/ingestion/parsers/unstructured.py +248 -0
  26. ragbits/document_search/ingestion/strategies/__init__.py +6 -0
  27. ragbits/document_search/ingestion/strategies/base.py +290 -0
  28. ragbits/document_search/ingestion/strategies/batched.py +261 -0
  29. ragbits/document_search/ingestion/strategies/ray.py +138 -0
  30. ragbits/document_search/ingestion/strategies/sequential.py +23 -0
  31. ragbits/document_search/py.typed +0 -0
  32. ragbits/document_search/retrieval/__init__.py +0 -0
  33. ragbits/document_search/retrieval/rephrasers/__init__.py +18 -0
  34. ragbits/document_search/retrieval/rephrasers/base.py +39 -0
  35. ragbits/document_search/retrieval/rephrasers/llm.py +141 -0
  36. ragbits/document_search/retrieval/rephrasers/noop.py +26 -0
  37. ragbits/document_search/retrieval/rerankers/__init__.py +4 -0
  38. ragbits/document_search/retrieval/rerankers/answerai.py +82 -0
  39. ragbits/document_search/retrieval/rerankers/base.py +56 -0
  40. ragbits/document_search/retrieval/rerankers/litellm.py +85 -0
  41. ragbits/document_search/retrieval/rerankers/llm.py +177 -0
  42. ragbits/document_search/retrieval/rerankers/noop.py +34 -0
  43. ragbits/document_search/retrieval/rerankers/rrf.py +73 -0
  44. ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA +85 -0
  45. ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD +46 -0
  46. ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL +4 -0
@@ -0,0 +1,3 @@
1
+ from ragbits.document_search._main import DocumentSearch, DocumentSearchOptions
2
+
3
+ __all__ = ["DocumentSearch", "DocumentSearchOptions"]
@@ -0,0 +1,273 @@
1
+ from collections.abc import Iterable, Sequence
2
+ from pathlib import Path
3
+ from types import ModuleType
4
+ from typing import ClassVar, Generic
5
+
6
+ from pydantic import BaseModel
7
+ from typing_extensions import Self
8
+
9
+ from ragbits import document_search
10
+ from ragbits.core.audit.traces import trace, traceable
11
+ from ragbits.core.config import CoreConfig
12
+ from ragbits.core.options import Options
13
+ from ragbits.core.sources.base import Source, SourceResolver
14
+ from ragbits.core.types import NOT_GIVEN, NotGiven
15
+ from ragbits.core.utils._pyproject import get_config_from_yaml
16
+ from ragbits.core.utils.config_handling import ConfigurableComponent, NoPreferredConfigError, ObjectConstructionConfig
17
+ from ragbits.core.vector_stores.base import VectorStore, VectorStoreOptionsT
18
+ from ragbits.document_search.documents.document import Document, DocumentMeta
19
+ from ragbits.document_search.documents.element import Element
20
+ from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
21
+ from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
22
+ from ragbits.document_search.ingestion.strategies.base import (
23
+ IngestExecutionError,
24
+ IngestExecutionResult,
25
+ IngestStrategy,
26
+ )
27
+ from ragbits.document_search.ingestion.strategies.sequential import SequentialIngestStrategy
28
+ from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser, QueryRephraserOptionsT
29
+ from ragbits.document_search.retrieval.rephrasers.noop import NoopQueryRephraser
30
+ from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptionsT
31
+ from ragbits.document_search.retrieval.rerankers.noop import NoopReranker
32
+
33
+
34
class DocumentSearchOptions(Options, Generic[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT]):
    """
    Bundle of per-component options used by a document search retrieval call.

    Each field defaults to ``NOT_GIVEN``.

    Attributes:
        query_rephraser_options: Options passed to the query rephraser.
        vector_store_options: Options passed to the vector store.
        reranker_options: Options passed to the reranker.
    """

    query_rephraser_options: QueryRephraserOptionsT | None | NotGiven = NOT_GIVEN
    vector_store_options: VectorStoreOptionsT | None | NotGiven = NOT_GIVEN
    reranker_options: RerankerOptionsT | None | NotGiven = NOT_GIVEN
47
+
48
+
49
class DocumentSearchConfig(BaseModel):
    """
    Validation schema for the configuration consumed by ``DocumentSearch.from_config``.

    Only ``vector_store`` is mandatory; every other entry has a default.
    """

    # Required: construction config of the vector store.
    vector_store: ObjectConstructionConfig
    # Optional components, defaulting to the no-op rephraser / reranker and the sequential strategy.
    rephraser: ObjectConstructionConfig = ObjectConstructionConfig(type="NoopQueryRephraser")
    reranker: ObjectConstructionConfig = ObjectConstructionConfig(type="NoopReranker")
    ingest_strategy: ObjectConstructionConfig = ObjectConstructionConfig(type="SequentialIngestStrategy")
    # Router configurations, handed as-is to the routers' ``from_config``; empty by default.
    parser_router: dict[str, ObjectConstructionConfig] = {}
    enricher_router: dict[str, ObjectConstructionConfig] = {}
60
+
61
+
62
class DocumentSearch(
    ConfigurableComponent[DocumentSearchOptions[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT]]
):
    """
    Main entrypoint to the document search functionality: document retrieval and ingestion.

    Retrieval (see `search`):
        1. The QueryRephraser rephrases the incoming query.
        2. The VectorStore is queried once for every rephrased query.
        3. The Reranker receives the per-query element lists together with the original query.

    Ingestion (see `ingest`):
        1. The IngestStrategy orchestrates the ingestion process.
        2. The DocumentParserRouter is handed to the strategy to pick a DocumentParser per document.
        3. The ElementEnricherRouter is handed to the strategy to pick an ElementEnricher per element.
    """

    options_cls: type[DocumentSearchOptions] = DocumentSearchOptions
    default_module: ClassVar[ModuleType | None] = document_search
    configuration_key: ClassVar[str] = "document_search"

    def __init__(
        self,
        vector_store: VectorStore[VectorStoreOptionsT],
        *,
        query_rephraser: QueryRephraser[QueryRephraserOptionsT] | None = None,
        reranker: Reranker[RerankerOptionsT] | None = None,
        default_options: DocumentSearchOptions[
            QueryRephraserOptionsT,
            VectorStoreOptionsT,
            RerankerOptionsT,
        ]
        | None = None,
        ingest_strategy: IngestStrategy | None = None,
        parser_router: DocumentParserRouter | None = None,
        enricher_router: ElementEnricherRouter | None = None,
    ) -> None:
        """
        Set up the document search with its components, falling back to defaults for omitted ones.

        Args:
            vector_store: The vector store used for retrieval and passed to the ingest strategy.
            query_rephraser: The query rephraser; a NoopQueryRephraser is used when omitted.
            reranker: The reranker; a NoopReranker is used when omitted.
            default_options: The default options for the search.
            ingest_strategy: The ingestion strategy; a SequentialIngestStrategy is used when omitted.
            parser_router: The document parser router; a default DocumentParserRouter is used when omitted.
            enricher_router: The element enricher router; a default ElementEnricherRouter is used when omitted.
        """
        super().__init__(default_options=default_options)
        self.vector_store = vector_store
        self.query_rephraser = query_rephraser if query_rephraser else NoopQueryRephraser()
        self.reranker = reranker if reranker else NoopReranker()
        self.ingest_strategy = ingest_strategy if ingest_strategy else SequentialIngestStrategy()
        self.parser_router = parser_router if parser_router else DocumentParserRouter()
        self.enricher_router = enricher_router if enricher_router else ElementEnricherRouter()

    @classmethod
    def from_config(cls, config: dict) -> Self:
        """
        Build a DocumentSearch instance out of a configuration dictionary.

        Args:
            config: The configuration, validated against DocumentSearchConfig.

        Returns:
            DocumentSearch: An initialized instance of the DocumentSearch class.

        Raises:
            ValidationError: If the configuration doesn't follow the expected format.
            InvalidConfigError: If one of the specified classes can't be found or is not the correct type.
        """
        parsed = DocumentSearchConfig.model_validate(config)

        # Retrieval components.
        rephraser: QueryRephraser = QueryRephraser.subclass_from_config(parsed.rephraser)
        store: VectorStore = VectorStore.subclass_from_config(parsed.vector_store)
        ranker: Reranker = Reranker.subclass_from_config(parsed.reranker)

        # Ingestion components.
        strategy = IngestStrategy.subclass_from_config(parsed.ingest_strategy)
        parsers = DocumentParserRouter.from_config(parsed.parser_router)
        enrichers = ElementEnricherRouter.from_config(parsed.enricher_router)

        return cls(
            vector_store=store,
            query_rephraser=rephraser,
            reranker=ranker,
            ingest_strategy=strategy,
            parser_router=parsers,
            enricher_router=enrichers,
        )

    @classmethod
    def preferred_subclass(
        cls,
        config: CoreConfig,
        factory_path_override: str | None = None,
        yaml_path_override: Path | None = None,
    ) -> Self:
        """
        Tries to create an instance by looking at the project's component preferences, either from YAML
        or from the factory. Takes optional overrides for both, which take a higher precedence.

        The sources are tried in this order: YAML override, factory override, preferred factory
        from the core config, preferred instances config from the core config.

        Args:
            config: The CoreConfig instance containing preferred factory and configuration details.
            factory_path_override: A string representing the path to the factory function
                in the format of "module.submodule:factory_name".
            yaml_path_override: The path to the YAML file containing the instance configuration.
                Looks for the configuration under the key "document_search", and if not found,
                instantiates the class with the preferred configuration for each component.

        Raises:
            NoPreferredConfigError: If no preferred factory or configuration can be found.
            InvalidConfigError: If the default factory or configuration can't be found.
        """
        key = cls.configuration_key

        if yaml_path_override:
            yaml_preferences = get_config_from_yaml(yaml_path_override)

            # An explicit document search entry wins over per-component preferences.
            explicit_config = yaml_preferences.get(key)
            if explicit_config:
                return cls.subclass_from_config(ObjectConstructionConfig.model_validate(explicit_config))

            # Otherwise build the instance from the preferred configuration of each component.
            return cls.from_config(yaml_preferences)

        if factory_path_override:
            return cls.subclass_from_factory(factory_path_override)

        configured_factory = config.component_preference_factories.get(key)
        if configured_factory:
            return cls.subclass_from_factory(configured_factory)

        if config.component_preference_config_path is None:
            raise NoPreferredConfigError(
                f"Could not find preferred factory or configuration for {cls.configuration_key}"
            )

        # An explicit document search entry wins over per-component preferences.
        explicit_config = config.preferred_instances_config.get(key)
        if explicit_config:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(explicit_config))

        # Otherwise build the instance from the preferred configuration of each component.
        return cls.from_config(config.preferred_instances_config)

    async def search(
        self,
        query: str,
        options: DocumentSearchOptions[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT] | None = None,
    ) -> Sequence[Element]:
        """
        Find the most relevant elements for a query.

        Args:
            query: The query to search for.
            options: The document search retrieval options, merged over the default options when given.

        Returns:
            The elements returned by the reranker.
        """
        merged_options = self.default_options
        if options:
            merged_options = merged_options | options

        # Falsy option values are normalized to None before being passed on.
        rephraser_options = merged_options.query_rephraser_options or None
        store_options = merged_options.vector_store_options or None
        ranker_options = merged_options.reranker_options or None

        with trace(query=query, options=merged_options) as outputs:
            rephrased_queries = await self.query_rephraser.rephrase(query, rephraser_options)

            # One list of elements per rephrased query, in the order the queries were produced.
            elements_per_query = []
            for rephrased_query in rephrased_queries:
                hits = await self.vector_store.retrieve(rephrased_query, store_options)
                elements_per_query.append([Element.from_vector_db_entry(hit.entry, hit.score) for hit in hits])

            # The reranker is given the original query, not the rephrased ones.
            outputs.results = await self.reranker.rerank(
                elements=elements_per_query,
                query=query,
                options=ranker_options,
            )

        return outputs.results

    @traceable
    async def ingest(
        self,
        documents: str | Iterable[DocumentMeta | Document | Source],
        fail_on_error: bool = True,
    ) -> IngestExecutionResult:
        """
        Ingest documents into the search index.

        Args:
            documents: A string representing a source-specific URI (e.g., "gcs://bucket/*") or an iterable of
                `Document`, `DocumentMeta`, or `Source` objects. Examples of URI formats include:
                - "file:///path/to/files/*.txt"
                - "gcs://bucket/folder/*"
                - "huggingface://dataset/split/row"
            fail_on_error: If True, raises IngestExecutionError when any errors are encountered during ingestion.
                If False, returns all errors encountered in the IngestExecutionResult.

        Returns:
            An IngestExecutionResult containing the results of the ingestion process.

        Raises:
            IngestExecutionError: If fail_on_error is True and any errors are encountered during ingestion.
        """
        # A plain string is a URI pattern that has to be resolved into sources first.
        if isinstance(documents, str):
            to_ingest = await SourceResolver.resolve(documents)
        else:
            to_ingest = documents

        outcome = await self.ingest_strategy(
            documents=to_ingest,
            vector_store=self.vector_store,
            parser_router=self.parser_router,
            enricher_router=self.enricher_router,
        )

        if fail_on_error and outcome.failed:
            raise IngestExecutionError(outcome.failed)

        return outcome
@@ -0,0 +1,109 @@
1
+ import asyncio
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Annotated
5
+
6
+ import typer
7
+ from pydantic import BaseModel
8
+
9
+ from ragbits.cli._utils import get_instance_or_exit
10
+ from ragbits.cli.state import print_output
11
+ from ragbits.core.vector_stores.base import VectorStoreOptions
12
+ from ragbits.document_search._main import DocumentSearch, DocumentSearchOptions
13
+
14
# Typer application holding the document search commands; `register` mounts it under "document-search".
ds_app = typer.Typer(no_args_is_help=True)
15
+
16
+
17
def register(app: typer.Typer) -> None:
    """
    Mount the document search commands on the given Typer application.

    Args:
        app: The Typer object to register the commands with.
    """
    app.add_typer(
        ds_app,
        name="document-search",
        help="Commands for interacting with the document search",
    )
25
+
26
+
27
# Container for the state shared between the CLI callback and the commands.
@dataclass
class _CLIState:
    # Document search instance used by the commands; None until one has been assigned.
    document_search: DocumentSearch | None = None
30
+
31
+
32
# Module-level CLI state; its `document_search` is assigned by the `common_args` callback.
state: _CLIState = _CLIState()

# Default columns for commands that list entries
_default_columns = "element_type,key"
36
+
37
+
38
class IngestedItem(BaseModel):
    """Output model reporting the source that has been ingested"""

    # The source pattern that was passed to the `ingest` command.
    source: str
42
+
43
+
44
@ds_app.callback()
def common_args(
    factory_path: Annotated[
        str | None,
        typer.Option(
            help="Python path to a function that creates a document search object, "
            "in a 'module.submodule:function' format"
        ),
    ] = None,
    yaml_path: Annotated[
        Path | None,
        typer.Option(help="Path to a YAML configuration file for the document search", exists=True, resolve_path=True),
    ] = None,
) -> None:
    """
    Common arguments for the document search commands.
    """
    # Resolve the instance once here and keep it in the module-level state read by the commands.
    instance = get_instance_or_exit(
        DocumentSearch,
        factory_path=factory_path,
        yaml_path=yaml_path,
    )
    state.document_search = instance
66
+
67
+
68
@ds_app.command()
def search(
    query: Annotated[str, typer.Argument(help="Text to query with")],
    k: Annotated[int, typer.Option(help="Number of entries to retrieve")] = 5,
    columns: Annotated[
        str,
        typer.Option(
            help="Comma-separated list of columns to display, "
            "available: id, element_type, key, location, text_representation, document_meta"
        ),
    ] = _default_columns,
) -> None:
    """
    Query the chosen vector store.
    """

    async def _search_and_print() -> None:
        # The instance is assigned by the `common_args` callback.
        engine = state.document_search
        if engine is None:
            raise ValueError("Document search not initialized")

        # `k` is passed on through the vector store options.
        search_options: DocumentSearchOptions = DocumentSearchOptions(vector_store_options=VectorStoreOptions(k=k))
        found = await engine.search(query, search_options)
        print_output(found, columns=columns)

    asyncio.run(_search_and_print())
93
+
94
+
95
@ds_app.command()
def ingest(
    source: Annotated[str, typer.Argument(help="Source pattern")],
) -> None:
    """
    Ingest the elements from a given source to vector store.
    """

    async def _ingest_and_report() -> None:
        # The instance is assigned by the `common_args` callback.
        engine = state.document_search
        if engine is None:
            raise ValueError("Document search not initialized")

        await engine.ingest(source)
        # Report the ingested source pattern once ingestion has returned without raising.
        print_output(IngestedItem(source=source))

    asyncio.run(_ingest_and_report())
File without changes
@@ -0,0 +1,203 @@
1
+ import tempfile
2
+ from enum import Enum
3
+ from pathlib import Path
4
+ from typing import Annotated, Any
5
+
6
+ import filetype
7
+ from pydantic import BaseModel
8
+ from typing_extensions import deprecated
9
+
10
+ from ragbits.core.sources.base import Source, SourceDiscriminator
11
+ from ragbits.core.sources.local import LocalFileSource
12
+
13
+
14
class DocumentType(str, Enum):
    """
    Document types that can be parsed.

    Lookup by value falls back to a case-insensitive match (so ``DocumentType("PDF")``
    is ``DocumentType.PDF``); any value that still does not match maps to ``UNKNOWN``
    instead of raising ``ValueError``.
    """

    MD = "md"
    TXT = "txt"
    PDF = "pdf"
    CSV = "csv"
    DOC = "doc"
    DOCX = "docx"
    HTML = "html"
    EPUB = "epub"
    XLSX = "xlsx"
    XLS = "xls"
    ORG = "org"
    ODT = "odt"
    PPT = "ppt"
    PPTX = "pptx"
    RST = "rst"
    RTF = "rtf"
    TSV = "tsv"
    JSON = "json"
    JSONL = "jsonl"
    XML = "xml"
    JPG = "jpg"
    PNG = "png"

    UNKNOWN = "unknown"

    @classmethod
    def _missing_(cls, value: object) -> Any:  # noqa: ANN401
        """
        Resolve a value that did not match any member exactly.

        Args:
            value: The value passed to the enum constructor.

        Returns:
            The member whose value equals the lower-cased input, or UNKNOWN when there is none.
        """
        # File extensions are frequently upper- or mixed-case ("REPORT.PDF"), so retry lower-cased.
        if isinstance(value, str):
            lowered = value.lower()
            for member in cls:
                if member.value == lowered:
                    return member
        return cls.UNKNOWN
50
+
51
+
52
class DocumentMeta(BaseModel):
    """
    Metadata of a document: its type and the source it comes from.
    """

    document_type: DocumentType
    source: Annotated[Source, SourceDiscriminator()]

    @property
    def id(self) -> str:
        """
        Identifier of the document, taken from its source.

        Returns:
            The document ID.
        """
        return self.source.id

    async def fetch(self) -> "Document":
        """
        Fetch the document from its source (potentially remote) and wrap it in an object to interface with it.
        The concrete class of the returned object depends on the document type.

        Returns:
            The document.
        """
        fetched_path = await self.source.fetch()
        return Document.from_document_meta(self, fetched_path)

    @classmethod
    @deprecated("Use from_literal() instead")
    def create_text_document_from_literal(cls, content: str) -> "DocumentMeta":
        """
        Deprecated alias of from_literal(); use from_literal() instead.

        Args:
            content: The content of the document.

        Returns:
            The document metadata.
        """
        return cls.from_literal(content)

    @classmethod
    def from_literal(cls, content: str) -> "DocumentMeta":
        """
        Create a text document out of literal content.

        The content is written to a temporary file that is not deleted on close,
        and the returned metadata points at that file.

        Args:
            content: The content of the document.

        Returns:
            The document metadata.
        """
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        with temp_file:
            temp_file.write(content.encode())

        literal_source = LocalFileSource(path=Path(temp_file.name))
        return cls(document_type=DocumentType.TXT, source=literal_source)

    @classmethod
    def from_local_path(cls, local_path: Path) -> "DocumentMeta":
        """
        Create document metadata for a file on the local filesystem.

        Args:
            local_path: The local path to the document.

        Returns:
            The document metadata.
        """
        inferred_type = cls._infer_document_type(local_path)
        local_source = LocalFileSource(path=local_path)
        return cls(document_type=inferred_type, source=local_source)

    @classmethod
    async def from_source(cls, source: Source) -> "DocumentMeta":
        """
        Create document metadata for a source.

        The source is fetched so that the document type can be inferred from the fetched file.

        Args:
            source: The source from which the document is fetched.

        Returns:
            The document metadata.
        """
        fetched_path = await source.fetch()
        inferred_type = cls._infer_document_type(fetched_path)
        return cls(document_type=inferred_type, source=source)

    @staticmethod
    def _infer_document_type(path: Path) -> DocumentType:
        """
        Infer the document type from what `filetype.guess` reports, falling back to the file extension.

        Args:
            path: The path to the file.

        Returns:
            The inferred document type.
        """
        guessed = filetype.guess(path)
        # Fall back to the suffix without its leading dot when nothing was guessed.
        extension = guessed.extension if guessed else path.suffix[1:]
        return DocumentType(extension)
162
+
163
+
164
class Document(BaseModel):
    """
    A document that is available locally, together with its metadata.
    """

    local_path: Path
    metadata: DocumentMeta

    @classmethod
    def from_document_meta(cls, document_meta: DocumentMeta, local_path: Path) -> "Document":
        """
        Create a document out of its metadata and local path.
        Markdown and plain-text documents become a TextDocument; every other type uses this class.

        Args:
            document_meta: The document metadata.
            local_path: The local path to the document.

        Returns:
            The document.
        """
        is_text = document_meta.document_type in (DocumentType.MD, DocumentType.TXT)
        document_cls = TextDocument if is_text else cls
        return document_cls(local_path=local_path, metadata=document_meta)
188
+
189
+
190
class TextDocument(Document):
    """
    An object representing a text document.
    """

    @property
    def content(self) -> str:
        """
        Read the content of the document from its local path.

        Returns:
            The content of the document, decoded as UTF-8.

        Raises:
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # Decode explicitly as UTF-8 rather than with the platform default encoding:
        # DocumentMeta.from_literal writes its content with str.encode() (UTF-8), so relying
        # on the locale would garble non-ASCII text on systems whose default is not UTF-8.
        return self.local_path.read_text(encoding="utf-8")