ragbits-document-search 1.4.0.dev202601310254__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. ragbits/document_search/__init__.py +3 -0
  2. ragbits/document_search/_main.py +273 -0
  3. ragbits/document_search/cli.py +109 -0
  4. ragbits/document_search/documents/__init__.py +0 -0
  5. ragbits/document_search/documents/document.py +203 -0
  6. ragbits/document_search/documents/element.py +208 -0
  7. ragbits/document_search/ingestion/__init__.py +0 -0
  8. ragbits/document_search/ingestion/enrichers/__init__.py +5 -0
  9. ragbits/document_search/ingestion/enrichers/base.py +64 -0
  10. ragbits/document_search/ingestion/enrichers/exceptions.py +32 -0
  11. ragbits/document_search/ingestion/enrichers/image.py +107 -0
  12. ragbits/document_search/ingestion/enrichers/router.py +86 -0
  13. ragbits/document_search/ingestion/parsers/__init__.py +9 -0
  14. ragbits/document_search/ingestion/parsers/base.py +97 -0
  15. ragbits/document_search/ingestion/parsers/docling.py +178 -0
  16. ragbits/document_search/ingestion/parsers/exceptions.py +32 -0
  17. ragbits/document_search/ingestion/parsers/pptx/__init__.py +28 -0
  18. ragbits/document_search/ingestion/parsers/pptx/callbacks.py +32 -0
  19. ragbits/document_search/ingestion/parsers/pptx/exceptions.py +52 -0
  20. ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py +84 -0
  21. ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py +78 -0
  22. ragbits/document_search/ingestion/parsers/pptx/parser.py +85 -0
  23. ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py +75 -0
  24. ragbits/document_search/ingestion/parsers/router.py +90 -0
  25. ragbits/document_search/ingestion/parsers/unstructured.py +248 -0
  26. ragbits/document_search/ingestion/strategies/__init__.py +6 -0
  27. ragbits/document_search/ingestion/strategies/base.py +290 -0
  28. ragbits/document_search/ingestion/strategies/batched.py +261 -0
  29. ragbits/document_search/ingestion/strategies/ray.py +138 -0
  30. ragbits/document_search/ingestion/strategies/sequential.py +23 -0
  31. ragbits/document_search/py.typed +0 -0
  32. ragbits/document_search/retrieval/__init__.py +0 -0
  33. ragbits/document_search/retrieval/rephrasers/__init__.py +18 -0
  34. ragbits/document_search/retrieval/rephrasers/base.py +39 -0
  35. ragbits/document_search/retrieval/rephrasers/llm.py +141 -0
  36. ragbits/document_search/retrieval/rephrasers/noop.py +26 -0
  37. ragbits/document_search/retrieval/rerankers/__init__.py +4 -0
  38. ragbits/document_search/retrieval/rerankers/answerai.py +82 -0
  39. ragbits/document_search/retrieval/rerankers/base.py +56 -0
  40. ragbits/document_search/retrieval/rerankers/litellm.py +85 -0
  41. ragbits/document_search/retrieval/rerankers/llm.py +177 -0
  42. ragbits/document_search/retrieval/rerankers/noop.py +34 -0
  43. ragbits/document_search/retrieval/rerankers/rrf.py +73 -0
  44. ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA +85 -0
  45. ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD +46 -0
  46. ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL +4 -0
@@ -0,0 +1,82 @@
1
+ from collections.abc import Sequence
2
+ from itertools import chain
3
+ from typing import Any
4
+
5
+ from rerankers import Reranker as AnswerReranker
6
+
7
+ from ragbits.core.audit.traces import trace
8
+ from ragbits.document_search.documents.element import Element
9
+ from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptions
10
+
11
+
12
class AnswerAIReranker(Reranker[RerankerOptions]):
    """
    A [rerankers](https://github.com/AnswerDotAI/rerankers) re-ranker covering most popular re-ranking methods.

    The wrapped ranker is created once in the constructor and reused for every `rerank` call.
    """

    options_cls: type[RerankerOptions] = RerankerOptions

    def __init__(
        self,
        model: str,
        default_options: RerankerOptions | None = None,
        **rerankers_kwargs: Any,  # noqa: ANN401
    ) -> None:
        """
        Initialize the AnswerAIReranker instance.

        Args:
            model: The reranker model to use.
            default_options: The default options for reranking.
            **rerankers_kwargs: Additional keyword arguments native to rerankers lib.
        """
        super().__init__(default_options=default_options)
        self.model = model
        self.ranker = AnswerReranker(self.model, **rerankers_kwargs)

    async def rerank(
        self,
        elements: Sequence[Sequence[Element]],
        query: str,
        options: RerankerOptions | None = None,
    ) -> Sequence[Element]:
        """
        Rerank elements.

        All input lists are flattened into one candidate list, ranked against the query, optionally cut to
        `top_n`, and then filtered by `score_threshold`.

        Args:
            elements: The elements to rerank.
            query: The query to rerank the elements against.
            options: The options for reranking.

        Returns:
            The reranked elements.

        Raises:
            ValueError: Raised if the input query is empty or if the list of candidate documents is empty.
            TypeError: Raised if the input types are incorrect, such as if the query is not a string, or List[str].
            IndexError: Raised if docs is an empty List.
        """
        merged_options = (self.default_options | options) if options else self.default_options
        flat_elements = list(chain.from_iterable(elements))
        # Elements without a text representation are still ranked, using an empty document.
        documents = [element.text_representation or "" for element in flat_elements]

        # Only a real number is a threshold. A truthiness test would silently ignore an explicit 0.0,
        # which is a meaningful cut-off because scores are not known here to be strictly positive.
        raw_threshold = merged_options.score_threshold
        threshold = raw_threshold if isinstance(raw_threshold, (int, float)) else None

        with trace(
            query=query, documents=documents, elements=elements, model=self.model, options=merged_options
        ) as outputs:
            response = self.ranker.rank(
                query=query,
                docs=documents,
            )
            if merged_options.top_n:
                response = response.top_k(merged_options.top_n)

            results = []
            for result in response:
                if threshold is None or result.score >= threshold:
                    # doc_id is used as the index of the document in the flattened input list.
                    element = flat_elements[result.document.doc_id]
                    if merged_options.override_score:
                        element.score = result.score
                    results.append(element)

            outputs.results = results

        return outputs.results
@@ -0,0 +1,56 @@
1
+ from abc import ABC, abstractmethod
2
+ from collections.abc import Sequence
3
+ from typing import ClassVar, TypeVar
4
+
5
+ from ragbits.core.options import Options
6
+ from ragbits.core.types import NOT_GIVEN, NotGiven
7
+ from ragbits.core.utils.config_handling import ConfigurableComponent
8
+ from ragbits.document_search.documents.element import Element
9
+ from ragbits.document_search.retrieval import rerankers
10
+
11
+
12
class RerankerOptions(Options):
    """
    Options common to all rerankers.

    Attributes:
        top_n: How many entries to return at most. Defaults to NOT_GIVEN.
        score_threshold: Lowest relevance score an entry may have and still be returned. Defaults to NOT_GIVEN.
        override_score: When True, reranking replaces the score stored on each element. Defaults to True.
    """

    top_n: int | None | NotGiven = NOT_GIVEN
    score_threshold: float | None | NotGiven = NOT_GIVEN
    override_score: bool = True
25
+
26
+
27
# Type variable for the options class a concrete reranker is parameterised with.
RerankerOptionsT = TypeVar("RerankerOptionsT", bound=RerankerOptions)


class Reranker(ConfigurableComponent[RerankerOptionsT], ABC):
    """
    Abstract base class for components that rerank elements retrieved from vector store.

    Subclasses set `options_cls` to their options type and implement `rerank`.
    """

    options_cls: type[RerankerOptionsT]
    default_module: ClassVar = rerankers
    configuration_key: ClassVar = "reranker"

    @abstractmethod
    async def rerank(
        self,
        elements: Sequence[Sequence[Element]],
        query: str,
        options: RerankerOptionsT | None = None,
    ) -> Sequence[Element]:
        """
        Produce a single reranked sequence from one or more lists of elements.

        Args:
            elements: Lists of elements to rerank.
            query: Query the elements are reranked against.
            options: Reranking options for this call.

        Returns:
            The reranked elements.
        """
@@ -0,0 +1,85 @@
1
+ from collections.abc import Sequence
2
+ from itertools import chain
3
+
4
+ import litellm
5
+
6
+ from ragbits.core.audit.traces import traceable
7
+ from ragbits.core.types import NOT_GIVEN, NotGiven
8
+ from ragbits.document_search.documents.element import Element
9
+ from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptions
10
+
11
+
12
class LiteLLMRerankerOptions(RerankerOptions):
    """
    Object representing the options for the litellm reranker.

    Attributes:
        top_n: The number of entries to return.
        score_threshold: The minimum relevance score for an entry to be returned.
        override_score: If True reranking will override element score.
        max_chunks_per_doc: Forwarded as the `max_chunks_per_doc` argument of the LiteLLM rerank call.
            It limits how many chunks a long document is split into by the provider and is not a token
            limit (NOTE(review): exact semantics are provider specific - confirm against the provider docs).
    """

    max_chunks_per_doc: int | None | NotGiven = NOT_GIVEN
24
+
25
+
26
class LiteLLMReranker(Reranker[LiteLLMRerankerOptions]):
    """
    A [LiteLLM](https://docs.litellm.ai/docs/rerank) reranker for providers such as Cohere, Together AI, Azure AI.
    """

    options_cls: type[LiteLLMRerankerOptions] = LiteLLMRerankerOptions

    def __init__(
        self,
        model: str,
        default_options: LiteLLMRerankerOptions | None = None,
    ) -> None:
        """
        Initialize the LiteLLMReranker instance.

        Args:
            model: The reranker model to use.
            default_options: The default options for reranking.
        """
        super().__init__(default_options=default_options)
        self.model = model

    @traceable
    async def rerank(
        self,
        elements: Sequence[Sequence[Element]],
        query: str,
        options: LiteLLMRerankerOptions | None = None,
    ) -> Sequence[Element]:
        """
        Rerank elements with LiteLLM API.

        All input lists are flattened into one candidate list before being sent to the API. `top_n` and
        `max_chunks_per_doc` are passed to the API; `score_threshold` is applied locally to the response.

        Args:
            elements: The elements to rerank.
            query: The query to rerank the elements against.
            options: The options for reranking.

        Returns:
            The reranked elements.
        """
        merged_options = (self.default_options | options) if options else self.default_options
        flat_elements = list(chain.from_iterable(elements))
        # Elements without a text representation are still sent, as an empty document.
        documents = [element.text_representation or "" for element in flat_elements]

        response = await litellm.arerank(
            model=self.model,
            query=query,
            documents=documents,
            # `or None` maps NOT_GIVEN (and other falsy values) to None for the API call.
            top_n=merged_options.top_n or None,
            max_chunks_per_doc=merged_options.max_chunks_per_doc or None,
        )

        # Only a real number is a threshold. A truthiness test would silently ignore an explicit 0.0.
        raw_threshold = merged_options.score_threshold
        threshold = raw_threshold if isinstance(raw_threshold, (int, float)) else None

        results = []
        for result in response.results:
            relevance = result["relevance_score"]
            if threshold is None or relevance >= threshold:
                # "index" points back into the flattened list of submitted documents.
                element = flat_elements[result["index"]]
                if merged_options.override_score:
                    element.score = relevance
                results.append(element)

        return results
@@ -0,0 +1,177 @@
1
+ import math
2
+ from collections.abc import Sequence
3
+ from itertools import chain
4
+
5
+ from pydantic import BaseModel
6
+ from typing_extensions import Self
7
+
8
+ from ragbits.core.audit.traces import traceable
9
+ from ragbits.core.llms.base import LLM
10
+ from ragbits.core.llms.litellm import LiteLLM, LiteLLMOptions
11
+ from ragbits.core.prompt.prompt import Prompt
12
+ from ragbits.core.types import NOT_GIVEN, NotGiven
13
+ from ragbits.core.utils.config_handling import ObjectConstructionConfig, import_by_path
14
+ from ragbits.document_search.documents.element import Element
15
+ from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptions
16
+
17
+
18
class RerankerInput(BaseModel):
    """
    Input model for the reranking prompt: one query paired with one document text.
    """

    query: str
    document: str
25
+
26
+
27
class RerankerPrompt(Prompt[RerankerInput, str]):
    """
    Prompt asking the model whether a document is relevant to a query, answered with "Yes" or "No".
    """

    # NOTE(review): leading whitespace inside the two template literals below was reconstructed from a
    # rendering that dropped indentation - confirm against the packaged file before applying.
    system_prompt = """
    You are an Assistant responsible for helping detect whether the retrieved document is relevant to the query.
    For a given input, you need to output a single token: "Yes" or "No" indicating the retrieved document is relevant to the query.
    """  # noqa: E501
    user_prompt = """
    Query: {{query}}
    Document: {{document}}
    Relevant:
    """
41
+
42
+
43
class LLMRerankerOptions(RerankerOptions):
    """
    Options for the LLM-based reranker.

    Attributes:
        top_n: How many entries to return at most.
        score_threshold: Lowest relevance score an entry may have and still be returned.
        override_score: When True, reranking replaces the score stored on each element.
        llm_options: Options passed to the LLM. Defaults to NOT_GIVEN.
    """

    llm_options: LiteLLMOptions | None | NotGiven = NOT_GIVEN
55
+
56
+
57
class LLMReranker(Reranker[LLMRerankerOptions]):
    """
    Reranker based on LLM.

    Each element is scored with one single-token LLM call ("Yes"/"No"). The log-probability of the
    generated token is converted into a relevance score.
    """

    options_cls: type[LLMRerankerOptions] = LLMRerankerOptions

    def __init__(
        self,
        llm: LiteLLM,
        *,
        prompt: type[Prompt[RerankerInput, str]] | None = None,
        default_options: LLMRerankerOptions | None = None,
    ) -> None:
        """
        Initialize the LLMReranker instance.

        Args:
            llm: The LLM instance to handle reranking.
            prompt: The prompt to use for reranking elements.
            default_options: The default options for reranking.
        """
        super().__init__(default_options=default_options)
        self._llm = llm
        self._prompt = prompt or RerankerPrompt
        # Base LLM options: one token, log-probabilities enabled, and a logit bias on " Yes" and " No".
        self._llm_options = LiteLLMOptions(
            temperature=0.0,
            logprobs=True,
            max_tokens=1,
            logit_bias={
                self._llm.get_token_id(" Yes"): 1,
                self._llm.get_token_id(" No"): 1,
            },
        )

    @classmethod
    def from_config(cls, config: dict) -> Self:
        """
        Initialize the class with the provided configuration.

        The caller's dictionary is not modified; a shallow copy is resolved and passed on.

        Args:
            config: A dictionary containing configuration details for the class.

        Returns:
            The initialized instance of LLMReranker.

        Raises:
            ValidationError: If the configuration doesn't follow the expected format.
            InvalidConfigError: If llm or prompt can't be found or are not the correct type.
        """
        # Work on a copy so the caller's config keeps its original serialisable values.
        config = dict(config)
        config["llm"] = LLM.subclass_from_config(ObjectConstructionConfig.model_validate(config["llm"]))
        config["prompt"] = import_by_path(config["prompt"]) if "prompt" in config else None
        return super().from_config(config)

    @traceable
    async def rerank(
        self,
        elements: Sequence[Sequence[Element]],
        query: str,
        options: LLMRerankerOptions | None = None,
    ) -> Sequence[Element]:
        """
        Rerank elements with LLM.

        Elements are flattened, scored, sorted by descending score, cut to `top_n`, and then filtered
        by `score_threshold`.

        Args:
            elements: The elements to rerank.
            query: The query to rerank the elements against.
            options: The options for reranking.

        Returns:
            The reranked elements.
        """
        merged_options = (self.default_options | options) if options else self.default_options
        # Caller-supplied LLM options are merged over the base options built in the constructor.
        llm_options = (
            self._llm_options | merged_options.llm_options if merged_options.llm_options else self._llm_options
        )

        flat_elements = list(chain.from_iterable(elements))
        scores = await self._score_elements(flat_elements, query, llm_options)

        scored_elements = list(zip(flat_elements, scores, strict=True))
        scored_elements.sort(key=lambda x: x[1], reverse=True)

        results = []
        for element, score in scored_elements[: merged_options.top_n or None]:
            if not merged_options.score_threshold or score >= merged_options.score_threshold:
                if merged_options.override_score:
                    element.score = score
                results.append(element)
        return results

    async def _score_elements(
        self,
        elements: Sequence[Element],
        query: str,
        llm_options: LiteLLMOptions,
    ) -> Sequence[float]:
        """
        Score the elements according to their relevance to the query using LLM.

        Elements without a text representation get a score of 0.0 and no LLM call is made for them.

        Args:
            elements: The elements to rerank.
            query: The query to rerank the elements against.
            llm_options: The LLM options to use for scoring.

        Returns:
            The elements scores.
        """
        scores = []
        for element in elements:
            if element.text_representation:
                prompt = self._prompt(RerankerInput(query=query, document=element.text_representation))
                response = await self._llm.generate_with_metadata(prompt=prompt, options=llm_options)
                # Probability of the single generated token.
                prob = math.exp(response.metadata["logprobs"][0]["logprob"])
                # Strip before comparing: the biased tokens are " Yes"/" No", so the completion may
                # carry whitespace, and an exact match would score such a "Yes" as a "No".
                score = prob if response.content.strip() == "Yes" else 1 - prob
            else:
                score = 0.0

            scores.append(score)

        return scores
@@ -0,0 +1,34 @@
1
+ from collections.abc import Sequence
2
+ from itertools import chain
3
+
4
+ from ragbits.core.audit.traces import traceable
5
+ from ragbits.document_search.documents.element import Element
6
+ from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptions
7
+
8
+
9
class NoopReranker(Reranker[RerankerOptions]):
    """
    Pass-through reranker: elements keep their incoming order and nothing is rescored.
    """

    options_cls: type[RerankerOptions] = RerankerOptions

    @traceable
    async def rerank(  # noqa: PLR6301
        self,
        elements: Sequence[Sequence[Element]],
        query: str,
        options: RerankerOptions | None = None,
    ) -> Sequence[Element]:
        """
        Flatten the input lists and return each element id once, without reordering.

        When an id occurs more than once, it keeps the position of its first occurrence and the
        element object of its last occurrence.

        Args:
            elements: Lists of elements to pass through.
            query: Query string; not used by this reranker.
            options: Reranking options; not used by this reranker.

        Returns:
            The de-duplicated elements in their original order.
        """
        unique_by_id = {}
        for item in chain.from_iterable(elements):
            unique_by_id[item.id] = item
        return list(unique_by_id.values())
@@ -0,0 +1,73 @@
1
+ from collections import defaultdict
2
+ from collections.abc import Sequence
3
+
4
+ from ragbits.core.audit.traces import traceable
5
+ from ragbits.document_search.documents.element import Element
6
+ from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptions
7
+
8
+
9
class ReciprocalRankFusionReranker(Reranker[RerankerOptions]):
    """
    Reranker that fuses several ranked result lists into one using the
    Reciprocal Rank Fusion (RRF) algorithm.

    Every document is scored only from the positions it holds in the input
    lists, so rankings from different sources can be combined without tuning.

    The fused score of a document is:

        score = sum(1.0 / (k + rank(q, d)))

    where:
        - k is a ranking constant (1 is used here)
        - q is a query in the set of queries
        - d is a document in the result set
        - rank(q, d) is the position of d in the ranking list for q (starting from 1)
    """

    options_cls: type[RerankerOptions] = RerankerOptions

    @traceable
    async def rerank(
        self,
        elements: Sequence[Sequence[Element]],
        query: str,
        options: RerankerOptions | None = None,
    ) -> Sequence[Element]:
        """
        Fuse the ranked lists with Reciprocal Rank Fusion (RRF).

        A single input list is returned unchanged, without applying any options.

        Args:
            elements: Ranked lists of elements to fuse.
            query: The query string for reranking.
            options: The options for reranking.

        Returns:
            The reranked elements.
        """
        # Nothing to fuse when there is only one ranking.
        if len(elements) == 1:
            return elements[0]

        if options:
            merged_options = self.default_options | options
        else:
            merged_options = self.default_options

        fused_scores: dict[str, float] = defaultdict(float)
        element_by_key: dict[str, Element] = {}

        for ranking in elements:
            # Counting from 2 gives k + rank with k = 1 and 1-based ranks.
            for denominator, candidate in enumerate(ranking, start=2):
                key = candidate.key
                if key:
                    fused_scores[key] += 1 / denominator
                    # The element seen last for a key is the one returned.
                    element_by_key[key] = candidate

        ranked = sorted(fused_scores.items(), key=lambda pair: pair[1], reverse=True)
        limit = merged_options.top_n or None
        threshold = merged_options.score_threshold

        results = []
        for key, fused in ranked[:limit]:
            if threshold and not fused >= threshold:
                continue
            chosen = element_by_key[key]
            if merged_options.override_score:
                chosen.score = fused
            results.append(chosen)

        return results
@@ -0,0 +1,85 @@
1
+ Metadata-Version: 2.4
2
+ Name: ragbits-document-search
3
+ Version: 1.4.0.dev202601310254
4
+ Summary: Document Search module for Ragbits
5
+ Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
6
+ Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
7
+ Project-URL: Documentation, https://ragbits.deepsense.ai/
8
+ Project-URL: Source, https://github.com/deepsense-ai/ragbits
9
+ Author-email: "deepsense.ai" <ragbits@deepsense.ai>
10
+ License-Expression: MIT
11
+ Keywords: Document Search,GenAI,Generative AI,LLMs,Large Language Models,RAG,Retrieval Augmented Generation
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Natural Language :: English
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: docling[easyocr]<3.0.0,>=2.15.1
26
+ Requires-Dist: filetype<2.0.0,>=1.2.0
27
+ Requires-Dist: opencv-python<5.0.0.0,>=4.11.0.86
28
+ Requires-Dist: python-pptx<2.0.0,>=1.0.0
29
+ Requires-Dist: ragbits-core==1.4.0.dev202601310254
30
+ Requires-Dist: rerankers<1.0.0,>=0.6.1
31
+ Provides-Extra: ray
32
+ Requires-Dist: ray[data]<3.0.0,>=2.43.0; extra == 'ray'
33
+ Provides-Extra: unstructured
34
+ Requires-Dist: unstructured-client<1.0.0,>=0.26.0; extra == 'unstructured'
35
+ Requires-Dist: unstructured<1.0.0,>=0.16.9; extra == 'unstructured'
36
+ Description-Content-Type: text/markdown
37
+
38
+ # Ragbits Document Search
39
+
40
+ Ragbits Document Search is a Python package that provides tools for building RAG applications. It helps ingest, index, and search documents to retrieve relevant information for your prompts.
41
+
42
+ ## Installation
43
+
44
+ You can install the latest version of Ragbits Document Search using pip:
45
+
46
+ ```bash
47
+ pip install ragbits-document-search
48
+ ```
49
+
50
+ ## Quickstart
51
+ ```python
52
+ import asyncio
53
+
54
+ from ragbits.core.embeddings import LiteLLMEmbedder
55
+ from ragbits.core.vector_stores.in_memory import InMemoryVectorStore
56
+ from ragbits.document_search import DocumentSearch
57
+
58
+ async def main() -> None:
59
+ """
60
+ Run the example.
61
+ """
62
+ embedder = LiteLLMEmbedder(
63
+ model_name="text-embedding-3-small",
64
+ )
65
+ vector_store = InMemoryVectorStore(embedder=embedder)
66
+ document_search = DocumentSearch(
67
+ vector_store=vector_store,
68
+ )
69
+
70
+ # Ingest all .txt files from the "biographies" directory
71
+ await document_search.ingest("local://biographies/*.txt")
72
+
73
+ # Search the documents for the query
74
+ results = await document_search.search("When was Marie Curie-Sklodowska born?")
75
+ print(results)
76
+
77
+
78
+ if __name__ == "__main__":
79
+ asyncio.run(main())
80
+ ```
81
+
82
+ ## Documentation
83
+ * [Quickstart 2: Adding RAG Capabilities](https://ragbits.deepsense.ai/quickstart/quickstart2_rag/)
84
+ * [How-To Guides - Document Search](https://ragbits.deepsense.ai/how-to/document_search/async_processing/)
85
+ * [API Reference - Document Search](https://ragbits.deepsense.ai/api_reference/document_search/)
@@ -0,0 +1,46 @@
1
+ ragbits/document_search/__init__.py,sha256=uYbt6kSsfKpL3IY07CCpiRhsG77oZkZjV1pEuwBX1n0,135
2
+ ragbits/document_search/_main.py,sha256=25oeZQ9mWX2tewHi67Wh35wbW3F1LlFthay7HuyL2rI,12331
3
+ ragbits/document_search/cli.py,sha256=GYcm-_tMgW96kBf8Oj88PodjXnWKaXV_OalloK1cU8s,3020
4
+ ragbits/document_search/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ ragbits/document_search/documents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ ragbits/document_search/documents/document.py,sha256=yxi-07fduOeqM0ay9E_BbDm11nGfqtCxuM13cSAfd74,5158
7
+ ragbits/document_search/documents/element.py,sha256=hQCMyGn4C6bfR7lh1vcKZepQpSySLpaA7Zz3D33GHjA,5898
8
+ ragbits/document_search/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ ragbits/document_search/ingestion/enrichers/__init__.py,sha256=owoQ-Qw76Yow8cNNfzYtgpjzw3SySrLXvuZ4Wv0Iap0,325
10
+ ragbits/document_search/ingestion/enrichers/base.py,sha256=_vPNgJeZvrV3A8HVAZjCi18MGtDtVMaat8rUN9Vq1Do,2307
11
+ ragbits/document_search/ingestion/enrichers/exceptions.py,sha256=f3dFENXD8rBM6yDdxQWJM4-GzJf-RBrJqArKnTS6GwA,1007
12
+ ragbits/document_search/ingestion/enrichers/image.py,sha256=81lC6gO-6xGrdArDsQ6doDVQgZcsHuqQrw3DQ7G4sXw,3512
13
+ ragbits/document_search/ingestion/enrichers/router.py,sha256=pEFO3sRb3eY8DZhWEGOsPv0iSS0lGAkcXuk1VLtjwiI,2913
14
+ ragbits/document_search/ingestion/parsers/__init__.py,sha256=WMkY-JEi_FxbegnhArrLQC2Dh4Mtn5O7Fu_fsVCEngQ,315
15
+ ragbits/document_search/ingestion/parsers/base.py,sha256=b9zAlDnXi81_WkCJSUpsRl7hVRQkmpxcqBnmQ89GCUc,3174
16
+ ragbits/document_search/ingestion/parsers/docling.py,sha256=nAIQEzoBEyYr9IAlovz6FIRDjUIlnkeKgckIvj2R8C8,6785
17
+ ragbits/document_search/ingestion/parsers/exceptions.py,sha256=ymCw6XhBhtF2gqJ9n-QzLCjzuaaS__J_VKllgwyBFO4,1005
18
+ ragbits/document_search/ingestion/parsers/router.py,sha256=C2dQT4s5I-CDuS6DE6chQyLyLXfhPoRDYlINRVM0ph8,3168
19
+ ragbits/document_search/ingestion/parsers/unstructured.py,sha256=yfze3TQYPzMnV3SwhqUg04bTBURfhOZsK-zJ3JOxl1k,9453
20
+ ragbits/document_search/ingestion/parsers/pptx/__init__.py,sha256=X9vfcWO8Ght3U2vOlNZFhE1_PYM0TZdSoTLS8pj98jo,627
21
+ ragbits/document_search/ingestion/parsers/pptx/callbacks.py,sha256=G6-4fg11c_xCXcEmcnplEcxwJm6mRj3e_NxHz45nTKQ,799
22
+ ragbits/document_search/ingestion/parsers/pptx/exceptions.py,sha256=fIdt9osO2p1QFq0yDPDNLLwhRLq9bW-wJg41lgUz7iY,1868
23
+ ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py,sha256=Peg5jYX8ePUS4yDsc1sD-xMHqtPnzGuQ517wtkASccg,3287
24
+ ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py,sha256=raxrMJTFd2S--V6eAosbrhbislAoVZx2LdM8EE-7gww,3038
25
+ ragbits/document_search/ingestion/parsers/pptx/parser.py,sha256=h07vIPFpWLFcKiAY9IxIj-775xHs0QUDb1lTxNpO9u4,3243
26
+ ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py,sha256=xKET2hq_OYnuYS4B8QIln1Pc2iVCr--psfahXtISdA4,2851
27
+ ragbits/document_search/ingestion/strategies/__init__.py,sha256=R7Tx1HKrDE38NqxewCrbvsJ9BTWpvWrjv6RlKUQuJgQ,462
28
+ ragbits/document_search/ingestion/strategies/base.py,sha256=DVrzSZXjiiDqX0ZEy9UO7m96zRu0zOgLpOraFGA6spo,10573
29
+ ragbits/document_search/ingestion/strategies/batched.py,sha256=N1SA4Z5sVOB50CX5i7fpsHlsVX63hf4ApascL3LO4Og,10031
30
+ ragbits/document_search/ingestion/strategies/ray.py,sha256=sNBGBuRDBTfuVSCJXiaPFXjvQkXLfPuD9kOBQnQi8-I,6018
31
+ ragbits/document_search/ingestion/strategies/sequential.py,sha256=H81rULKg4A_di5bxkir4mslbrVWm1TxjdI7dSlaPYE0,880
32
+ ragbits/document_search/retrieval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ ragbits/document_search/retrieval/rephrasers/__init__.py,sha256=RHW1TjSuXwEE6qkUXF2iVnbpcK0iJExOZqihHS_njIQ,578
34
+ ragbits/document_search/retrieval/rephrasers/base.py,sha256=N5egUeRkhz5198LYLZk6zx9KVEvhrj5WjB-7m7etST4,1137
35
+ ragbits/document_search/retrieval/rephrasers/llm.py,sha256=DLAXiFPNExdPGd4Yrg1FxEqVurN99L7qpgrUR_faTjw,5673
36
+ ragbits/document_search/retrieval/rephrasers/noop.py,sha256=4aFtxWCpueVg-uOFc247v3zS_cEB6Bsex37hhSdyxpk,832
37
+ ragbits/document_search/retrieval/rerankers/__init__.py,sha256=QFgBmjkHFjsVS89EpnUoUw36IpjXB45OcLUIcRjCZtE,220
38
+ ragbits/document_search/retrieval/rerankers/answerai.py,sha256=3bx0ao19pXa3vcgptfCbLxmmQL4KimV8AxVtj8kaYqk,2987
39
+ ragbits/document_search/retrieval/rerankers/base.py,sha256=oTHFI4hBNJvPL9s0flZYhwG5xZ9FIflThSuuy5JtgpA,1649
40
+ ragbits/document_search/retrieval/rerankers/litellm.py,sha256=C5pum0YGrQPT3raUD7hYVmAOM5bNHa8MsgWZLoEKTp4,2898
41
+ ragbits/document_search/retrieval/rerankers/llm.py,sha256=YcKu4OqPhChfL1BUygim0xBLtiHOOJqI4oNF6ilYtPc,5902
42
+ ragbits/document_search/retrieval/rerankers/noop.py,sha256=9v381IrfLbNYHQaecTNkQHN6rLZx-WB5QqqA5XKrF20,1072
43
+ ragbits/document_search/retrieval/rerankers/rrf.py,sha256=y281Rucv3fuyMNLjJrikqD5--wLu0KQcwbeI6WxmPao,2628
44
+ ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA,sha256=joisMG3fGB6M4XETPZyxUeFH8Ts0hMjUg5VzWw85enc,3261
45
+ ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
46
+ ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any